コード例 #1
0
 def setUp(self):
     """Builds the writer fixtures: output path, header, writer and variant.

     The header declares two contigs, two samples and the GT/GQ formats; the
     variant sits on 'Chr1' and carries one call per sample.
     """
     self.out_fname = test_utils.test_tmpfile('output.vcf')

     # Header pieces are assembled separately so each list reads on its own.
     contig_infos = [
         reference_pb2.ContigInfo(name='Chr1', n_bases=50, pos_in_fasta=0),
         reference_pb2.ContigInfo(name='Chr2', n_bases=25, pos_in_fasta=1),
     ]
     format_infos = [
         variants_pb2.VcfFormatInfo(
             id='GT', number='1', type='String', description='Genotype'),
         variants_pb2.VcfFormatInfo(
             id='GQ', number='1', type='Float',
             description='Genotype Quality'),
     ]
     self.header = variants_pb2.VcfHeader(
         contigs=contig_infos,
         sample_names=['Fido', 'Spot'],
         formats=format_infos)
     self.options = variants_pb2.VcfWriterOptions()
     self.writer = vcf_writer.VcfWriter.to_file(
         self.out_fname, self.header, self.options)

     self.variant = test_utils.make_variant(
         chrom='Chr1', start=10, alleles=['A', 'C'])
     fido_call = variants_pb2.VariantCall(
         genotype=[0, 0], call_set_name='Fido')
     spot_call = variants_pb2.VariantCall(
         genotype=[0, 1], call_set_name='Spot')
     self.variant.calls.extend([fido_call, spot_call])
コード例 #2
0
 def test_parse_literal_with_contig_map(self, contig_name, expected):
   """Checks parse_literal's result for contig_name against expected.

   The literal is parsed with a contig map that knows 'chr1' (10 bases) and
   'chr2' (5 bases).
   """
   chr1_info = reference_pb2.ContigInfo(name='chr1', n_bases=10)
   chr2_info = reference_pb2.ContigInfo(name='chr2', n_bases=5)
   known_contigs = {'chr1': chr1_info, 'chr2': chr2_info}

   parsed = ranges.parse_literal(contig_name, contig_map=known_contigs)
   self.assertEqual(parsed, expected)
コード例 #3
0
ファイル: io_utils_test.py プロジェクト: cgpu/deepvariant-1
 def setUp(self):
     """Creates two ContigInfo protos and a list holding both of them."""
     first = reference_pb2.ContigInfo(name='p1', n_bases=10, pos_in_fasta=0)
     second = reference_pb2.ContigInfo(name='p2', n_bases=20, pos_in_fasta=1)
     self.proto1, self.proto2 = first, second
     self.protos = [first, second]
コード例 #4
0
 def test_from_contigs(self):
   """RangeSet.from_contigs is expected to give one 0..n_bases range each."""
   contig_infos = [
       reference_pb2.ContigInfo(name='chr1', n_bases=10),
       reference_pb2.ContigInfo(name='chr2', n_bases=5),
   ]
   expected_ranges = [
       ranges.make_range('chr1', 0, 10),
       ranges.make_range('chr2', 0, 5),
   ]
   actual_ranges = ranges.RangeSet.from_contigs(contig_infos)
   # Order-insensitive comparison of the produced ranges.
   self.assertCountEqual(expected_ranges, actual_ranges)
コード例 #5
0
 def test_from_regions_not_empty(self):
   """RangeSet.from_regions should resolve literals against the contig map.

   A bare contig name ('chr1') is expected to expand to the whole contig
   (0..n_bases), and the literal 'chr2:10-20' is expected to become the range
   ('chr2', 9, 20).
   """
   literals = ['chr1', 'chr2:10-20']
   contig_map = {
       'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10),
       'chr2': reference_pb2.ContigInfo(name='chr2', n_bases=100),
   }
   # assertCountEqual replaces the deprecated assertItemsEqual alias: it is
   # the order-insensitive comparison that exists on Python 3 TestCases and
   # matches the assertion used by the sibling from_contigs test.
   self.assertCountEqual(
       [ranges.make_range('chr1', 0, 10),
        ranges.make_range('chr2', 9, 20)],
       ranges.RangeSet.from_regions(literals, contig_map))
コード例 #6
0
 def test_contigs_n_bases(self):
   """contigs_n_bases should equal the sum of n_bases over its input."""
   c1 = reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0)
   c2 = reference_pb2.ContigInfo(name='a', n_bases=50, pos_in_fasta=1)
   c3 = reference_pb2.ContigInfo(name='b', n_bases=25, pos_in_fasta=2)

   # (expected total, contigs passed in), checked in this order.
   cases = [
       (100, [c1]),
       (50, [c2]),
       (25, [c3]),
       (150, [c1, c2]),
       (125, [c1, c3]),
       (175, [c1, c2, c3]),
   ]
   for expected_total, contig_list in cases:
     self.assertEqual(expected_total, ranges.contigs_n_bases(contig_list))
コード例 #7
0
ファイル: fasta.py プロジェクト: cgpu/deepvariant-1
  def __init__(self, chromosomes):
    """Initializes an InMemoryFastaReader using data from chromosomes.

    Args:
      chromosomes: list[tuple]. The chromosomes we are caching in memory as a
        list of tuples. Each tuple must be exactly three elements in length,
        containing (chromosome name [str], start [int], bases [str]).

    Raises:
      ValueError: If any tuple has a negative start or an empty bases string.
    """
    super(InMemoryFastaReader, self).__init__()

    ref_seqs = []
    contigs = []
    for i, (contig_name, start, bases) in enumerate(chromosomes):
      if start < 0:
        # The message now states the bound (0) that the check enforces.
        raise ValueError('start={} must be >= 0 for chromosome={}'.format(
            start, contig_name))
      if not bases:
        raise ValueError(
            'Bases must contain at least one base, but got "{}"'.format(bases))

      end = start + len(bases)
      ref_seqs.append(reference_pb2.ReferenceSequence(
          region=ranges.make_range(contig_name, start, end), bases=bases))
      # n_bases is set to the end coordinate of the cached bases, and
      # pos_in_fasta to the tuple's index in the input list.
      contigs.append(
          reference_pb2.ContigInfo(
              name=contig_name, n_bases=end, pos_in_fasta=i))

    self._reader = in_memory_fasta_reader.InMemoryFastaReader.create(
        contigs, ref_seqs)
    # The header is built from the contigs reported back by the reader.
    self.header = RefFastaHeader(contigs=self._reader.contigs)
コード例 #8
0
ファイル: fasta.py プロジェクト: zuxfoucault/deepvariant
  def __init__(self, chromosomes):
    """Initializes an InMemoryRefReader using data from chromosomes.

    Args:
      chromosomes: List[tuple]. The chromosomes we are caching in memory as a
        list of tuples. Each tuple must be exactly three elements in length,
        containing (chromosome name [str], start [int], bases [str]).

    Raises:
      ValueError: If any tuple has a negative start, repeats a chromosome
        name already seen, or has an empty bases string.
    """
    super(InMemoryRefReader, self).__init__()

    self._chroms = {}
    contigs = []
    for i, (contig_name, start, bases) in enumerate(chromosomes):
      if start < 0:
        # The message now states the bound (0) that the check enforces.
        raise ValueError('start={} must be >= 0 for chromosome={}'.format(
            start, contig_name))
      if contig_name in self._chroms:
        raise ValueError(
            'Duplicate chromosome={} detected'.format(contig_name))
      if not bases:
        raise ValueError(
            'Bases must contain at least one base, but got "{}"'.format(bases))

      end = start + len(bases)
      self._chroms[contig_name] = _InMemoryChromosome(start, end, bases)
      # n_bases is set to the end coordinate of the cached bases, and
      # pos_in_fasta to the tuple's index in the input list.
      contigs.append(
          reference_pb2.ContigInfo(
              name=contig_name, n_bases=end, pos_in_fasta=i))

    self.header = RefFastaHeader(contigs=contigs)
コード例 #9
0
 def test_parse_literal_with_contig_map_and_bad_input_raises_exception(
     self, bad_literal):
   """parse_literal must raise ValueError for bad_literal with a contig map."""
   # The map only knows about 'chr1'.
   single_contig_map = {
       'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10),
   }
   with self.assertRaises(ValueError):
     ranges.parse_literal(bad_literal, contig_map=single_contig_map)
コード例 #10
0
 def setUp(self):
     """Builds six variants, a four-contig header and an InMemoryVcfReader.

     Contig '4' is declared in the header but none of the variants use it.
     """
     # (chrom, start) of every variant, in the order they are stored.
     variant_sites = [
         ('1', 10),
         ('1', 20),
         ('1', 30),
         ('2', 25),
         ('2', 55),
         ('3', 10),
     ]
     self.variants = [
         test_utils.make_variant(chrom=chrom, start=start)
         for chrom, start in variant_sites
     ]

     header_contigs = [
         reference_pb2.ContigInfo(name=contig_name, n_bases=100)
         for contig_name in ('1', '2', '3', '4')
     ]
     self.header = variants_pb2.VcfHeader(
         contigs=header_contigs, filters=[], sample_names=['NA12878'])
     self.reader = vcf.InMemoryVcfReader(self.variants, self.header)
コード例 #11
0
class DvVcfConstantsTest(parameterized.TestCase):
    """Tests for dv_vcf_constants.deepvariant_header."""

    @parameterized.parameters(
        {'contigs': [], 'sample_names': []},
        {
            'contigs': [reference_pb2.ContigInfo(name='chr1')],
            'sample_names': ['single_sample'],
        },
        {
            'contigs': [
                reference_pb2.ContigInfo(name='1'),
                reference_pb2.ContigInfo(name='2'),
            ],
            'sample_names': ['multiple', 'samples'],
        },
    )
    def test_deepvariant_header(self, contigs, sample_names):
        """The header echoes contigs/samples and has filters, infos, formats."""
        header = dv_vcf_constants.deepvariant_header(
            contigs=contigs, sample_names=sample_names)

        self.assertCountEqual(header.contigs, contigs)
        self.assertCountEqual(header.sample_names, sample_names)
        # Each descriptor section must contain at least one entry.
        for descriptors in (header.filters, header.infos, header.formats):
            self.assertGreater(len(descriptors), 0)
コード例 #12
0
  def test_sort_ranges(self):
    """sorted_ranges orders by contig name, or by contig position if given."""
    contigs = [
        reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0),
        reference_pb2.ContigInfo(name='a', n_bases=76, pos_in_fasta=1),
        reference_pb2.ContigInfo(name='b', n_bases=121, pos_in_fasta=2),
    ]
    shuffled = ranges.parse_literals(
        ['a:10', 'c:20', 'b:30', 'b:10-15', 'b:10', 'a:5'])

    # With no contigs supplied, contig names are compared lexicographically.
    expected_by_name = ranges.parse_literals(
        ['a:5', 'a:10', 'b:10', 'b:10-15', 'b:30', 'c:20'])
    self.assertEqual(expected_by_name, ranges.sorted_ranges(shuffled))

    # With contigs supplied, the contigs' own positions decide the order.
    expected_by_position = ranges.parse_literals(
        ['c:20', 'a:5', 'a:10', 'b:10', 'b:10-15', 'b:30'])
    self.assertEqual(expected_by_position,
                     ranges.sorted_ranges(shuffled, contigs))
コード例 #13
0
 def setUp(self):
     """Creates two reads, a two-contig list and an empty SamHeader."""
     self.read1 = test_utils.make_read(
         bases='ACCGT', chrom='chr1', start=10, cigar='5M', mapq=50,
         quals=range(30, 35), name='read1')
     self.read2 = test_utils.make_read(
         bases='AACCTT', chrom='chr2', start=15, cigar='7M', mapq=40,
         quals=range(20, 26), name='read2')

     chr1_info = reference_pb2.ContigInfo(name='chr1')
     chr2_info = reference_pb2.ContigInfo(name='chr2')
     self.contigs = [chr1_info, chr2_info]
     self.header = reads_pb2.SamHeader()
コード例 #14
0
def _make_contigs(specs):
    """Makes ContigInfo protos from specs.

  Args:
    specs: A list of 2- or 3-tuples. All tuples should be of the same length. If
      2-element, these should be the name and length in basepairs of each
      contig, and their pos_in_fasta will be set to their index in the list. If
      the 3-element, the tuple should contain name, length, and pos_in_fasta.

  Returns:
    A list of ContigInfo protos, one for each spec in specs.
  """
    if specs and len(specs[0]) == 3:
        return [
            reference_pb2.ContigInfo(name=name, n_bases=length, pos_in_fasta=i)
            for name, length, i in specs
        ]
    else:
        return [
            reference_pb2.ContigInfo(name=name, n_bases=length, pos_in_fasta=i)
            for i, (name, length) in enumerate(specs)
        ]
コード例 #15
0
  def test_expand_raises_with_missing_contig_in_map(self):
    """expand must raise KeyError when the range's contig is not in the map."""
    map_without_contig_1 = {
        '2': reference_pb2.ContigInfo(name='2', n_bases=50),
    }
    # First an empty contig_map, then one that simply lacks contig '1'.
    for contig_map in ({}, map_without_contig_1):
      with self.assertRaises(KeyError):
        ranges.expand(
            ranges.make_range('1', 10, 20), 1, contig_map=contig_map)
コード例 #16
0
ファイル: ranges_test.py プロジェクト: palc/deepvariant
    def test_rangeset_iteration_order(self):
        """Iterating a RangeSet follows contig position, or name without one."""
        contigs = [
            reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0),
            reference_pb2.ContigInfo(name='b', n_bases=121, pos_in_fasta=2),
            reference_pb2.ContigInfo(name='a', n_bases=76, pos_in_fasta=1),
        ]
        shuffled = ranges.parse_literals(
            ['a:10', 'c:20', 'b:30', 'b:10-15', 'a:5'])

        # Built with a contigs list: order is pos_in_fasta, then start, end.
        with_contigs = ranges.RangeSet(shuffled, contigs)
        expected_by_position = ranges.parse_literals(
            ['c:20', 'a:5', 'a:10', 'b:10-15', 'b:30'])
        self.assertEqual(expected_by_position, list(with_contigs))

        # Built *without* contigs: order is reference_name, then start, end.
        without_contigs = ranges.RangeSet(shuffled)
        expected_by_name = ranges.parse_literals(
            ['a:5', 'a:10', 'b:10-15', 'b:30', 'c:20'])
        self.assertEqual(expected_by_name, list(without_contigs))
コード例 #17
0
ファイル: vcf_test.py プロジェクト: PhilPalmer/deepvariant-1
 def write_variant_to_tempfile(self, variant):
   """Writes variant to 'test.vcf' via vcf.VcfWriter and returns the path.

   The header declares contig '20', the DP and AD formats, and one sample
   per call in the variant.

   Args:
     variant: The variant to write; its calls provide the sample names.

   Returns:
     The path of the written file, as given by test_utils.test_tmpfile.
   """
   sample_names = [call.call_set_name for call in variant.calls]
   depth_format = variants_pb2.VcfFormatInfo(
       id='DP', number='1', type='Integer', description='Read depth')
   allele_depth_format = variants_pb2.VcfFormatInfo(
       id='AD', number='R', type='Integer',
       description='Read depth for each allele')
   header = variants_pb2.VcfHeader(
       contigs=[reference_pb2.ContigInfo(name='20')],
       sample_names=sample_names,
       formats=[depth_format, allele_depth_format])

   output_path = test_utils.test_tmpfile('test.vcf')
   writer = vcf.VcfWriter(output_path, header=header)
   with writer:
     writer.write(variant)
   return output_path
コード例 #18
0
from __future__ import division
from __future__ import print_function



from absl.testing import absltest

from third_party.nucleus.io.python import vcf_reader
from third_party.nucleus.protos import index_pb2
from third_party.nucleus.protos import reference_pb2
from third_party.nucleus.protos import variants_pb2
from third_party.nucleus.testing import test_utils
from third_party.nucleus.util import ranges

expected_sites_contigs = [
    reference_pb2.ContigInfo(name='chr1', pos_in_fasta=0, n_bases=248956422),
    reference_pb2.ContigInfo(name='chr2', pos_in_fasta=1, n_bases=242193529),
    reference_pb2.ContigInfo(name='chr3', pos_in_fasta=2, n_bases=198295559),
    reference_pb2.ContigInfo(name='chr4', pos_in_fasta=3, n_bases=190214555),
    reference_pb2.ContigInfo(name='chr5', pos_in_fasta=4, n_bases=181538259),
    reference_pb2.ContigInfo(name='chr6', pos_in_fasta=5, n_bases=170805979),
    reference_pb2.ContigInfo(name='chr7', pos_in_fasta=6, n_bases=159345973),
    reference_pb2.ContigInfo(name='chr8', pos_in_fasta=7, n_bases=145138636),
    reference_pb2.ContigInfo(name='chr9', pos_in_fasta=8, n_bases=138394717),
    reference_pb2.ContigInfo(name='chr10', pos_in_fasta=9, n_bases=133797422),
    reference_pb2.ContigInfo(name='chr11', pos_in_fasta=10, n_bases=135086622),
    reference_pb2.ContigInfo(name='chr12', pos_in_fasta=11, n_bases=133275309),
    reference_pb2.ContigInfo(name='chr13', pos_in_fasta=12, n_bases=114364328),
    reference_pb2.ContigInfo(name='chr14', pos_in_fasta=13, n_bases=107043718),
    reference_pb2.ContigInfo(name='chr15', pos_in_fasta=14, n_bases=101991189),
    reference_pb2.ContigInfo(name='chr16', pos_in_fasta=15, n_bases=90338345),
コード例 #19
0
 def test_sam_contigs(self):
     """The reader's header must list the 25 expected contigs in order."""
     # (name, n_bases) per contig; pos_in_fasta is the row's index.
     contig_table = [
         ('chrM', 16571),
         ('chr1', 249250621),
         ('chr2', 243199373),
         ('chr3', 198022430),
         ('chr4', 191154276),
         ('chr5', 180915260),
         ('chr6', 171115067),
         ('chr7', 159138663),
         ('chr8', 146364022),
         ('chr9', 141213431),
         ('chr10', 135534747),
         ('chr11', 135006516),
         ('chr12', 133851895),
         ('chr13', 115169878),
         ('chr14', 107349540),
         ('chr15', 102531392),
         ('chr16', 90354753),
         ('chr17', 81195210),
         ('chr18', 78077248),
         ('chr19', 59128983),
         ('chr20', 63025520),
         ('chr21', 48129895),
         ('chr22', 51304566),
         ('chrX', 155270560),
         ('chrY', 59373566),
     ]
     expected_contigs = [
         reference_pb2.ContigInfo(
             name=contig_name, pos_in_fasta=position, n_bases=length)
         for position, (contig_name, length) in enumerate(contig_table)
     ]

     reader = sam_reader.SamReader.from_file(
         reads_path=self.bam, ref_path='', options=self.options)
     with reader:
         self.assertEqual(expected_contigs, list(reader.header.contigs))
コード例 #20
0
    def test_writing_canned_variants(self):
        """Writes the first five canned variants and checks the VCF text."""
        # The canned variants are stored in the TF record format.
        tfrecord_file = test_utils.genomics_core_testdata(
            'test_samples.vcf.golden.tfrecord')

        writer_options = variants_pb2.VcfWriterOptions()

        contig_infos = [
            reference_pb2.ContigInfo(name='chr1', n_bases=248956422),
            reference_pb2.ContigInfo(name='chr2', n_bases=242193529),
            reference_pb2.ContigInfo(name='chr3', n_bases=198295559),
            reference_pb2.ContigInfo(name='chrX', n_bases=156040895),
        ]

        # Only PASS and LowQual pass a description; every tranche filter is
        # declared by id alone.
        tranche_filter_ids = [
            'VQSRTrancheINDEL95.00to96.00',
            'VQSRTrancheINDEL96.00to97.00',
            'VQSRTrancheINDEL97.00to99.00',
            'VQSRTrancheINDEL99.00to99.50',
            'VQSRTrancheINDEL99.50to99.90',
            'VQSRTrancheINDEL99.90to99.95',
            'VQSRTrancheINDEL99.95to100.00+',
            'VQSRTrancheINDEL99.95to100.00',
            'VQSRTrancheSNP99.50to99.60',
            'VQSRTrancheSNP99.60to99.80',
            'VQSRTrancheSNP99.80to99.90',
            'VQSRTrancheSNP99.90to99.95',
            'VQSRTrancheSNP99.95to100.00+',
            'VQSRTrancheSNP99.95to100.00',
        ]
        filter_infos = [
            variants_pb2.VcfFilterInfo(
                id='PASS', description='All filters passed'),
            variants_pb2.VcfFilterInfo(id='LowQual', description=''),
        ]
        filter_infos.extend(
            variants_pb2.VcfFilterInfo(id=filter_id)
            for filter_id in tranche_filter_ids)

        info_fields = [
            variants_pb2.VcfInfo(
                id='END', number='1', type='Integer',
                description='Stop position of the interval'),
        ]

        # (id, number, type, description) for each FORMAT field, in order.
        format_specs = [
            ('GT', '1', 'String', 'Genotype'),
            ('GQ', '1', 'Integer', 'Genotype Quality'),
            ('DP', '1', 'Integer',
             'Read depth of all passing filters reads.'),
            ('MIN_DP', '1', 'Integer',
             'Minimum DP observed within the GVCF block.'),
            ('AD', 'R', 'Integer',
             'Read depth of all passing filters reads for each allele.'),
            ('VAF', 'A', 'Float', 'Variant allele fractions.'),
            ('PL', 'G', 'Integer', 'Genotype likelihoods, Phred encoded'),
        ]
        format_infos = [
            variants_pb2.VcfFormatInfo(
                id=field_id, number=number, type=value_type,
                description=description)
            for field_id, number, value_type, description in format_specs
        ]

        header = variants_pb2.VcfHeader(
            contigs=contig_infos,
            sample_names=['NA12878_18_99'],
            filters=filter_infos,
            infos=info_fields,
            formats=format_infos,
        )

        variant_records = list(
            io_utils.read_tfrecords(tfrecord_file, proto=variants_pb2.Variant))
        out_fname = test_utils.test_tmpfile('output.vcf')
        with vcf_writer.VcfWriter.to_file(out_fname, header,
                                          writer_options) as writer:
            # Only the first five records are written and checked.
            for record in variant_records[:5]:
                writer.write(record)

        # Check: are the variants written as expected?
        # pylint: disable=line-too-long
        expected_header_lines = [
            '##fileformat=VCFv4.2\n',
            '##FILTER=<ID=PASS,Description="All filters passed">\n',
            '##FILTER=<ID=LowQual,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
            '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
            'the interval">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
            '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
            'passing filters reads.">\n',
            '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP '
            'observed within the GVCF block.">\n',
            '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
            'passing filters reads for each allele.">\n',
            '##FORMAT=<ID=VAF,Number=A,Type=Float,Description="Variant allele '
            'fractions.">\n',
            '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
            'likelihoods, Phred encoded">\n',
            '##contig=<ID=chr1,length=248956422>\n',
            '##contig=<ID=chr2,length=242193529>\n',
            '##contig=<ID=chr3,length=198295559>\n',
            '##contig=<ID=chrX,length=156040895>\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
        ]
        expected_record_lines = [
            'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
            'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
            'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
            'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
            'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n',
        ]
        # pylint: enable=line-too-long
        expected_vcf_content = expected_header_lines + expected_record_lines

        with gfile.GFile(out_fname, 'r') as f:
            written_lines = f.readlines()
        self.assertEqual(written_lines, expected_vcf_content)
コード例 #21
0
from third_party.nucleus.util import genomics_math
from third_party.nucleus.util import io_utils
from third_party.nucleus.util import vcf_constants
from deepvariant import dv_vcf_constants
from deepvariant import postprocess_variants
from deepvariant import testdata
from deepvariant.protos import deepvariant_pb2
from deepvariant.testing import flagsaver

# Module-level handle on the flag values object exposed by `flags`.
FLAGS = flags.FLAGS

# Sample name constant; presumably the default for test fixtures in this
# module -- its uses are not visible in this excerpt.
_DEFAULT_SAMPLE_NAME = 'NA12878'

# Test contigs for gVCF merging code.
# NOTE: the list order ('1', '2', '10') differs from lexicographic name order,
# where '10' would sort before '2'.
_CONTIGS = [
    reference_pb2.ContigInfo(name='1', n_bases=100),
    reference_pb2.ContigInfo(name='2', n_bases=200),
    reference_pb2.ContigInfo(name='10', n_bases=300),
]


def dummy_reference_reader():
  """Returns an InMemoryFastaReader holding two contigs, '1' and '2'.

  Both contigs start at position 0 and carry 32 bases each.
  """
  chromosomes = [
      ('1', 0, 'AACCGGTTACGTTCGATTTTAAAACCCCGGGG'),
      ('2', 0, 'GCAGTGACGTAGCGATGACGTAGACGCTTACG'),
  ]
  return fasta.InMemoryFastaReader(chromosomes=chromosomes)


def setUpModule():
  """Module-level unittest fixture hook; calls testdata.init().

  unittest runs a function with this name once, before the tests in the
  module. What testdata.init() sets up is not visible in this excerpt.
  """
  testdata.init()
コード例 #22
0
 def write_test_protos(self, filename):
   """Writes ten ContigInfo protos (named '0'..'9') as TFRecords.

   Args:
     filename: Name passed to test_utils.test_tmpfile to get the output path.

   Returns:
     A (protos, path) tuple: the protos written and the path they went to.
   """
   path = test_utils.test_tmpfile(filename)
   protos = [reference_pb2.ContigInfo(name=str(idx)) for idx in range(10)]
   io.write_tfrecords(protos, path)
   return protos, path
コード例 #23
0
class RangesTests(parameterized.TestCase):
  """Tests for the helper functions and `RangeSet` class in `ranges`."""

  def test_ranges_overlaps(self):
    """Checks `ranges_overlap` with the two ranges given in either order."""

    def check_overlaps(chr1, start1, end1, chr2, start2, end2, expected):
      i1 = ranges.make_range(chr1, start1, end1)
      i2 = ranges.make_range(chr2, start2, end2)
      self.assertEqual(ranges.ranges_overlap(i1, i2), expected)
      self.assertEqual(ranges.ranges_overlap(i2, i1), expected)

    check_overlaps('chr1', 0, 3, 'chr1', 4, 10, False)
    check_overlaps('chr1', 0, 3, 'chr1', 3, 10, False)
    check_overlaps('chr1', 0, 3, 'chr1', 2, 10, True)
    check_overlaps('chr1', 0, 3, 'chr1', 1, 10, True)
    check_overlaps('chr1', 0, 3, 'chr1', 0, 10, True)
    check_overlaps('chr1', 0, 3, 'chr1', 0, 1, True)
    check_overlaps('chr1', 0, 3, 'chr1', 0, 2, True)
    check_overlaps('chr1', 0, 3, 'chr1', 0, 3, True)
    check_overlaps('chr1', 0, 3, 'chr1', 1, 2, True)
    check_overlaps('chr1', 0, 3, 'chr1', 1, 3, True)
    check_overlaps('chr1', 0, 3, 'chr1', 2, 3, True)
    check_overlaps('chr1', 0, 3, 'chr1', 3, 3, False)
    check_overlaps('chr1', 1, 3, 'chr1', 0, 4, True)
    check_overlaps('chr1', 1, 3, 'chr1', 1, 4, True)

  def test_detector_no_ranges(self):
    """A default-constructed RangeSet is falsy and overlaps nothing."""
    range_set = ranges.RangeSet()
    # don't have any ranges by default
    self.assertFalse(range_set)
    # make sure we can call overlaps without any ranges
    self.assertFalse(range_set.overlaps('chr1', 10))

  def test_from_regions_not_empty(self):
    """A bare contig name expands to the whole contig via the contig map."""
    literals = ['chr1', 'chr2:10-20']
    contig_map = {
        'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10),
        'chr2': reference_pb2.ContigInfo(name='chr2', n_bases=100),
    }
    self.assertCountEqual(
        [ranges.make_range('chr1', 0, 10),
         ranges.make_range('chr2', 9, 20)],
        ranges.RangeSet.from_regions(literals, contig_map))

  def test_from_regions_empty_literals(self):
    """No region literals yields an empty, falsy RangeSet."""
    range_set = ranges.RangeSet.from_regions([], contig_map=None)
    # The set is empty.
    self.assertCountEqual([], range_set)
    self.assertFalse(range_set)

  @parameterized.parameters(
      # Overlapping intervals get merged.
      (['1:1-5', '1:3-8'], ['1:1-8']),
      (['1:1-5', '1:3-8', '1:6-9'], ['1:1-9']),
      # Adjacent intervals are merged.
      (['1:1-5', '1:5-8'], ['1:1-8']),
      (['1:1-5', '1:5-8', '1:8-10'], ['1:1-10']),
      # Sanity check that non-overlapping aren't merged.
      (['1:1-5', '1:6-8'], ['1:1-5', '1:6-8']),
  )
  def test_overlapping_and_adjacent_ranges_are_merged(self, regions, expected):
    """RangeSets built from `regions` and `expected` hold the same ranges."""
    self.assertCountEqual(
        ranges.RangeSet.from_regions(expected),
        ranges.RangeSet.from_regions(regions))

  def test_detector_ranges(self):
    """Checks `overlaps` position by position for a multi-contig RangeSet."""
    test_ranges = [
        ranges.make_range('chr1', 0, 5),
        ranges.make_range('chr1', 8, 10),
        ranges.make_range('chr1', 12, 13),
        ranges.make_range('chr2', 2, 5),
    ]
    range_set = ranges.RangeSet(test_ranges)
    self.assertTrue(range_set)
    self.assertEqual(len(range_set), 4)

    # Each range's end position is expected NOT to overlap (see 5, 10, 13).
    self.assertEqual(range_set.overlaps('chr1', 0), True)
    self.assertEqual(range_set.overlaps('chr1', 1), True)
    self.assertEqual(range_set.overlaps('chr1', 2), True)
    self.assertEqual(range_set.overlaps('chr1', 3), True)
    self.assertEqual(range_set.overlaps('chr1', 4), True)
    self.assertEqual(range_set.overlaps('chr1', 5), False)
    self.assertEqual(range_set.overlaps('chr1', 6), False)
    self.assertEqual(range_set.overlaps('chr1', 7), False)
    self.assertEqual(range_set.overlaps('chr1', 8), True)
    self.assertEqual(range_set.overlaps('chr1', 9), True)
    self.assertEqual(range_set.overlaps('chr1', 10), False)
    self.assertEqual(range_set.overlaps('chr1', 11), False)
    self.assertEqual(range_set.overlaps('chr1', 12), True)
    self.assertEqual(range_set.overlaps('chr1', 13), False)
    self.assertEqual(range_set.overlaps('chr1', 100), False)
    self.assertEqual(range_set.overlaps('chr1', 1000), False)
    self.assertEqual(range_set.overlaps('chr2', 0), False)
    self.assertEqual(range_set.overlaps('chr2', 1), False)
    self.assertEqual(range_set.overlaps('chr2', 2), True)
    self.assertEqual(range_set.overlaps('chr2', 3), True)
    self.assertEqual(range_set.overlaps('chr2', 4), True)
    self.assertEqual(range_set.overlaps('chr2', 5), False)
    self.assertEqual(range_set.overlaps('chr2', 6), False)
    self.assertEqual(range_set.overlaps('chr3', 3), False)

  def test_overlaps_variant_with_ranges(self):
    """`variant_overlaps` calls `overlaps` with the variant's contig and start."""
    variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
    range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)])
    # `overlaps` is mocked, so the result reflects the mock's return value
    # rather than the (non-overlapping) contents of range_set.
    with mock.patch.object(range_set, 'overlaps') as mock_overlaps:
      mock_overlaps.return_value = True
      self.assertEqual(range_set.variant_overlaps(variant), True)
      mock_overlaps.assert_called_once_with('chr2', 10)

  def test_overlaps_variant_empty_range(self):
    """An empty RangeSet returns `empty_set_return_value` unchanged."""
    variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
    empty_set = ranges.RangeSet()
    self.assertEqual(
        empty_set.variant_overlaps(variant, empty_set_return_value='foo'),
        'foo')

  @parameterized.parameters(
      (ranges.make_range('1', 10, 50), '1', 9, False),
      (ranges.make_range('1', 10, 50), '1', 10, True),
      (ranges.make_range('1', 10, 50), '2', 10, False),
      (ranges.make_range('1', 10, 50), '1', 30, True),
      (ranges.make_range('1', 10, 50), '2', 30, False),
      (ranges.make_range('1', 10, 50), '1', 49, True),
      (ranges.make_range('1', 10, 50), '1', 50, False),
      (ranges.make_range('1', 10, 50), '1', 51, False),
  )
  def test_position_overlaps(self, interval, chrom, pos, expected):
    """Checks `position_overlaps` at and around the interval boundaries."""
    self.assertEqual(ranges.position_overlaps(chrom, pos, interval), expected)

  def test_make_position(self):
    """`make_position` fills a Position proto; reverse_strand is False unless set."""
    self.assertEqual(
        ranges.make_position('chr1', 10),
        position_pb2.Position(
            reference_name='chr1', position=10, reverse_strand=False))
    self.assertEqual(
        ranges.make_position('chr2', 100, reverse_strand=True),
        position_pb2.Position(
            reference_name='chr2', position=100, reverse_strand=True))

  def test_make_range(self):
    """`make_range` stores its arguments in reference_name, start and end."""
    interval = ranges.make_range('chr1', 1, 10)
    self.assertEqual(interval.reference_name, 'chr1')
    self.assertEqual(interval.start, 1)
    self.assertEqual(interval.end, 10)

  def test_to_literal(self):
    """A range starting at 0 is rendered with a literal start of 1."""
    self.assertEqual(
        ranges.to_literal(ranges.make_range('chr1', 0, 20)), 'chr1:1-20')

  @parameterized.parameters(['chr1', '1', 'MT', 'chrM', 'chrX', 'X', 'Y'])
  def test_parse_literal_chromosomes(self, chrom):
    """`<chrom>:1-20` parses to the range (chrom, 0, 20) for each name."""
    self.assertEqual(
        ranges.parse_literal(chrom + ':1-20'), ranges.make_range(chrom, 0, 20))

  @parameterized.parameters(
      ('chr1:{}-{}'.format(start_str, end_str), start_val, end_val)
      for start_str, start_val in [('12', 11), ('1,234', 1233)]
      for end_str, end_val in [('56789', 56789), ('56,789', 56789)])
  def test_parse_literal_numerics(self, literal, start_val, end_val):
    """Numbers with and without thousands separators parse to the same values."""
    self.assertEqual(
        ranges.parse_literal(literal),
        ranges.make_range('chr1', start_val, end_val))

  def test_parse_literal_one_bp(self):
    """A `contig:pos` literal parses to the one-base range ending at pos."""
    self.assertEqual(
        ranges.parse_literal('1:10'), ranges.make_range('1', 9, 10))
    self.assertEqual(
        ranges.parse_literal('1:100'), ranges.make_range('1', 99, 100))
    self.assertEqual(
        ranges.parse_literal('1:1,000'), ranges.make_range('1', 999, 1000))

  @parameterized.parameters(['x', 'chr1', 'chr1:', 'chr1:10-', 'chr1:-1-10'])
  def test_parse_literal_bad(self, bad_literal):
    """Malformed literals raise ValueError when no contig_map is given."""
    with self.assertRaises(ValueError):
      ranges.parse_literal(bad_literal)

  @parameterized.parameters('test.bed', 'test.bed.gz')
  def test_from_bed(self, bed_filename):
    """`RangeSet.from_bed` yields the same ranges for both testdata files."""
    source = test_utils.genomics_core_testdata(bed_filename)
    self.assertCountEqual([
        ranges.make_range('chr1', 1, 10),
        ranges.make_range('chr2', 20, 30),
        ranges.make_range('chr2', 40, 60),
        ranges.make_range('chr3', 80, 90),
    ], ranges.RangeSet.from_bed(source))

  @parameterized.parameters(
      dict(regions=[], expected=[]),
      dict(regions=['chr1:10-20'], expected=[ranges.make_range('chr1', 9, 20)]),
      dict(regions=['test.bed'], expected=_TEST_BED_REGIONS),
      dict(
          regions=['test.bed', 'test.bed'],
          expected=_TEST_BED_REGIONS + _TEST_BED_REGIONS),
      dict(
          regions=['chr1:10-20', 'test.bed'],
          expected=[ranges.make_range('chr1', 9, 20)] + _TEST_BED_REGIONS),
      dict(
          regions=['test.bed', 'chr1:10-20'],
          expected=_TEST_BED_REGIONS + [ranges.make_range('chr1', 9, 20)]),
      dict(
          regions=['chr1:9-19', 'test.bed', 'chr1:10-20'],
          expected=([ranges.make_range('chr1', 8, 19)] + _TEST_BED_REGIONS +
                    [ranges.make_range('chr1', 9, 20)])),
  )
  def test_from_regions(self, regions, expected):
    """`from_regions` yields ranges for literals and BED files in input order."""
    # For convenience we allow 'test.bed' in our regions but the actual file
    # path is in our testdata directory. Build a new list instead of assigning
    # into `regions`, which is the shared parameter object declared above.
    resolved_regions = [
        test_utils.genomics_core_testdata('test.bed')
        if region == 'test.bed' else region for region in regions
    ]

    self.assertEqual(list(ranges.from_regions(resolved_regions)), expected)

  @parameterized.parameters(
      # Intersection with 1, 2, 3 identical RangeSets produces the original set.
      ([['1:1-10']], ['1:1-10']),
      ([['1:1-10'], ['1:1-10']], ['1:1-10']),
      ([['1:1-10'], ['1:1-10'], ['1:1-10']], ['1:1-10']),
      # Test some simple overlap configurations.
      ([['1:1-10'], ['1:11-15']], []),
      ([['1:1-10'], ['1:10-15']], ['1:10']),
      ([['1:1-10'], ['1:9-15']], ['1:9-10']),
      ([['1:5-10'], ['1:1-15']], ['1:5-10']),
      ([['1:5-10'], ['1:1-4']], []),
      ([['1:5-10'], ['1:1-5']], ['1:5']),
      # Check cutting a single interval into multiple pieces.
      ([['1:5-15'], ['1:6-8', '1:10-12']], ['1:6-8', '1:10-12']),
      ([['1:5-15'], ['1:3-8', '1:10-12']], ['1:5-8', '1:10-12']),
      ([['1:5-15'], ['1:3-8', '1:10-20']], ['1:5-8', '1:10-15']),
      # We have multiple overlapping intervals; make sure we merge intervals.
      ([['1:5-15'], ['1:3-8', '1:6-10']], ['1:5-10']),
      ([['1:5-15'], ['1:3-8', '1:6-10', '1:13']], ['1:5-10', '1:13']),
      # Check that multiple intervals work.
      ([['1:5-15', '1:20-25'], ['1:3-8', '1:16-23']], ['1:5-8', '1:20-23']),
      ([['1:5-15', '1:20-25'], ['1:3-8', '1:50-60']], ['1:5-8']),
      ([['1:5-15', '1:20-25'], ['1:3-4', '1:16-23']], ['1:20-23']),
      # Check that multiple sets can be intersected.
      ([['1:10-20'], ['1:5-15']], ['1:10-15']),
      ([['1:10-20'], ['1:5-15'], ['1:13-30']], ['1:13-15']),
      ([['1:10-20'], ['1:5-15'], ['1:25-30']], []),
      # Check that different chromosomes are kept separate.
      ([['1:10-20'], ['2:10-20']], []),
      ([['1:10-20', '2:11-14'], ['1:11-14']], ['1:11-14']),
      ([['1:10-20', '2:11-14'], ['2:10-20']], ['2:11-14']),
  )
  def test_intersection(self, regions, expected):
    """Intersects the first RangeSet with the rest and checks inputs are intact."""
    regions_list = [ranges.RangeSet.from_regions(r) for r in regions]
    # Snapshot every input so we can verify intersection() did not mutate them.
    copies = [ranges.RangeSet(rs) for rs in regions_list]

    # Check that the intersection is as expected.
    self.assertCountEqual(
        ranges.RangeSet.from_regions(expected),
        regions_list[0].intersection(*regions_list[1:]))

    # Check that no one was modified.
    for pre, post in zip(copies, regions_list):
      self.assertCountEqual(pre, post)

  @parameterized.parameters(
      dict(lhs=['1:1-100'], rhs=['1:10-20'], expected=['1:1-9', '1:21-100']),
      dict(lhs=['1:1-100'], rhs=[], expected=['1:1-100']),
      dict(lhs=['1:1-100', '2:1-10'], rhs=['2:1-100'], expected=['1:1-100']),
      dict(
          lhs=['1:1-100'],
          rhs=['1:10-20', '1:15-30'],
          expected=['1:1-9', '1:31-100']),
      dict(
          lhs=['1:1-100'],
          rhs=['1:10-20', '1:30-40'],
          expected=['1:1-9', '1:21-29', '1:41-100']),
      # Excluding regions not in lhs has no impact.
      dict(lhs=['1:1-100'], rhs=['2:1-100'], expected=['1:1-100']),
      # Check that excluding the whole region results in an empty RangeSet.
      dict(lhs=['1:1-100'], rhs=['1:1-100'], expected=[]),
      # An empty tree remains empty.
      dict(lhs=[], rhs=['1:1-100'], expected=[]),
  )
  def test_exclude_regions(self, lhs, rhs, expected):
    """`exclude_regions` removes `rhs` from `lhs` in place and returns None."""
    lhs = ranges.RangeSet.from_regions(lhs)
    rhs = ranges.RangeSet.from_regions(rhs)
    # Mutating operation returns None.
    self.assertIsNone(lhs.exclude_regions(rhs))
    self.assertCountEqual(ranges.RangeSet.from_regions(expected), lhs)

  @parameterized.parameters(('chr1', ranges.make_range('chr1', 0, 10)),
                            ('chr2', ranges.make_range('chr2', 0, 5)))
  def test_parse_literal_with_contig_map(self, contig_name, expected):
    """With a contig_map, a bare contig name parses to the full contig."""
    contig_map = {
        'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10),
        'chr2': reference_pb2.ContigInfo(name='chr2', n_bases=5),
    }
    self.assertEqual(
        ranges.parse_literal(contig_name, contig_map=contig_map), expected)

  @parameterized.parameters(['x', 'chr1:', 'chr1:10-', 'chr1:-1-10'])
  def test_parse_literal_with_contig_map_and_bad_input_raises_exception(
      self, bad_literal):
    """Malformed literals still raise ValueError when a contig_map is given."""
    with self.assertRaises(ValueError):
      ranges.parse_literal(
          bad_literal,
          contig_map={
              'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10)
          })

  def test_from_contigs(self):
    """`from_contigs` creates one range per contig spanning 0 to n_bases."""
    contigs = [
        reference_pb2.ContigInfo(name='chr1', n_bases=10),
        reference_pb2.ContigInfo(name='chr2', n_bases=5),
    ]
    self.assertCountEqual([
        ranges.make_range('chr1', 0, 10),
        ranges.make_range('chr2', 0, 5),
    ], ranges.RangeSet.from_contigs(contigs))

  @parameterized.parameters(
      # Chop our contigs into 50 bp pieces.
      (50, [('chrM', 0, 50), ('chrM', 50, 100), ('chr1', 0, 50),
            ('chr1', 50, 76), ('chr2', 0, 50), ('chr2', 50, 100),
            ('chr2', 100, 121)]),
      # Chop our contigs in 120 bp pieces, leaving a 1 bp fragment in chr2.
      (120, [('chrM', 0, 100), ('chr1', 0, 76), ('chr2', 0, 120),
             ('chr2', 120, 121)]),
      # A 500 max size spans each of our contigs fully.
      (500, [('chrM', 0, 100), ('chr1', 0, 76), ('chr2', 0, 121)]),
  )
  def test_partitions(self, interval_size, expected):
    """`partition` splits each range into pieces of at most `interval_size`."""
    rangeset = ranges.RangeSet([
        ranges.make_range('chrM', 0, 100),
        ranges.make_range('chr1', 0, 76),
        ranges.make_range('chr2', 0, 121),
    ])
    self.assertCountEqual([ranges.make_range(*args) for args in expected],
                          rangeset.partition(interval_size))

  def test_partitions_bad_interval_size_raises(self):
    """Negative and zero partition sizes raise a ValueError about max_size."""
    # list() is necessary to force the generator to execute.
    with self.assertRaisesRegex(ValueError, 'max_size'):
      list(ranges.RangeSet([ranges.make_range('chrM', 0, 100)]).partition(-10))
    with self.assertRaisesRegex(ValueError, 'max_size'):
      list(ranges.RangeSet([ranges.make_range('chrM', 0, 100)]).partition(0))

  @parameterized.parameters(
      (10, [('1', 0, 10), ('1', 20, 30), ('1', 30, 40), ('1', 45, 50)]),
      (7, [('1', 0, 7), ('1', 7, 10), ('1', 20, 27), ('1', 27, 34),
           ('1', 34, 40), ('1', 45, 50)]),
      (50, [('1', 0, 10), ('1', 20, 40), ('1', 45, 50)]),
  )
  def test_partition_of_multiple_intervals(self, interval_size, expected):
    """Separate ranges on one contig are partitioned independently."""
    rangeset = ranges.RangeSet([
        ranges.make_range('1', 0, 10),
        ranges.make_range('1', 20, 40),
        ranges.make_range('1', 45, 50),
    ])
    self.assertCountEqual([ranges.make_range(*args) for args in expected],
                          rangeset.partition(interval_size))

  def test_unknown_filetype(self):
    """`parse_lines` rejects an unrecognized file_format with ValueError."""
    with self.assertRaises(ValueError):
      ranges.parse_lines([], file_format='png')

  def test_bed_parser(self):
    """BED lines parse to ranges with the same contig, start and end."""
    data = [
        'chr20\t61724611\t61725646',
        'chr20\t61304163\t61305182',
        'chr20\t61286467\t61286789',
    ]
    self.assertEqual(
        list(ranges.parse_lines(data, 'bed')), [
            ranges.make_range('chr20', 61724611, 61725646),
            ranges.make_range('chr20', 61304163, 61305182),
            ranges.make_range('chr20', 61286467, 61286789),
        ])

  def test_bedpe_parser(self):
    """Each BEDPE line parses to one range from the first start to the last end."""
    # pylint: disable=line-too-long
    data = [
        'chr20\t25763416\t25765517\tchr20\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
        'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
        'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
    ]
    self.assertEqual(
        list(ranges.parse_lines(data, 'bedpe')), [
            ranges.make_range('chr20', 25763416, 25826882),
            ranges.make_range('chr20', 25972820, 26045538),
            ranges.make_range('chr20', 23719873, 23796523),
        ])

  def test_bedpe_parser_skips_cross_chr_events(self):
    """A BEDPE line whose two contigs differ produces no range."""
    # pylint: disable=line-too-long
    # The first line pairs chr20 with chr21 and is absent from the expectation.
    data = [
        'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
        'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
        'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
    ]
    self.assertEqual(
        list(ranges.parse_lines(data, 'bedpe')), [
            ranges.make_range('chr20', 25972820, 26045538),
            ranges.make_range('chr20', 23719873, 23796523),
        ])

  def test_contigs_n_bases(self):
    """`contigs_n_bases` returns the sum of n_bases over the given contigs."""
    c1 = reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0)
    c2 = reference_pb2.ContigInfo(name='a', n_bases=50, pos_in_fasta=1)
    c3 = reference_pb2.ContigInfo(name='b', n_bases=25, pos_in_fasta=2)
    self.assertEqual(100, ranges.contigs_n_bases([c1]))
    self.assertEqual(50, ranges.contigs_n_bases([c2]))
    self.assertEqual(25, ranges.contigs_n_bases([c3]))
    self.assertEqual(150, ranges.contigs_n_bases([c1, c2]))
    self.assertEqual(125, ranges.contigs_n_bases([c1, c3]))
    self.assertEqual(175, ranges.contigs_n_bases([c1, c2, c3]))

  def test_sort_ranges(self):
    """`sorted_ranges` orders by contig name, or by contig order if given."""
    contigs = [
        reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0),
        reference_pb2.ContigInfo(name='a', n_bases=76, pos_in_fasta=1),
        reference_pb2.ContigInfo(name='b', n_bases=121, pos_in_fasta=2),
    ]
    unsorted = ranges.parse_literals(
        ['a:10', 'c:20', 'b:30', 'b:10-15', 'b:10', 'a:5'])

    # Without contigs we sort the contigs by name lexicographically.
    self.assertEqual(
        ranges.parse_literals(
            ['a:5', 'a:10', 'b:10', 'b:10-15', 'b:30', 'c:20']),
        ranges.sorted_ranges(unsorted))

    # With contigs we sort by the position of the contigs themselves.
    self.assertEqual(
        ranges.parse_literals(
            ['c:20', 'a:5', 'a:10', 'b:10', 'b:10-15', 'b:30']),
        ranges.sorted_ranges(unsorted, contigs))

  @parameterized.parameters(
      (ranges.make_range('1', 0, 10), ranges.make_range('2', 0, 10), 0),
      (ranges.make_range('1', 0, 10), ranges.make_range('1', 10, 20), 0),
      (ranges.make_range('1', 0, 10), ranges.make_range('1', 100, 200), 0),
      (ranges.make_range('1', 10, 10), ranges.make_range('1', 0, 20), 0),
      (ranges.make_range('1', 0, 100), ranges.make_range('1', 50, 99), 49),
      # Check that the overlap handles a few key edge cases.
      (ranges.make_range('1', 0, 10), ranges.make_range('1', 0, 1), 1),
      (ranges.make_range('1', 0, 10), ranges.make_range('1', 0, 2), 2),
      (ranges.make_range('1', 1, 10), ranges.make_range('1', 0, 1), 0),
  )
  def test_overlap_len(self, region_1, region_2, expected_overlap):
    """Checks ranges.overlap_len() with the regions passed in either order."""
    self.assertEqual(expected_overlap, ranges.overlap_len(region_1, region_2))
    self.assertEqual(expected_overlap, ranges.overlap_len(region_2, region_1))

  @parameterized.parameters(
      # No search_regions produces None.
      dict(
          query_range=ranges.make_range('1', 20, 30),
          search_ranges=[],
          expected=None),

      # Read overlaps with none of the ranges returns None.
      dict(
          query_range=ranges.make_range('1', 20, 30),
          search_ranges=[
              ranges.make_range('1', 0, 10),
              ranges.make_range('1', 5, 10)
          ],
          expected=None),

      # Read has longer overlap with the first range.
      dict(
          query_range=ranges.make_range('1', 4, 10),
          search_ranges=[
              ranges.make_range('1', 0, 10),
              ranges.make_range('1', 5, 10)
          ],
          expected=0),

      # Read has longer overlap with the second range.
      dict(
          query_range=ranges.make_range('1', 9, 20),
          search_ranges=[
              ranges.make_range('1', 0, 10),
              ranges.make_range('1', 5, 15)
          ],
          expected=1),

      # Read has the maximum overlap with the third range.
      dict(
          query_range=ranges.make_range('1', 9, 20),
          search_ranges=[
              ranges.make_range('1', 0, 10),
              ranges.make_range('1', 0, 15),
              ranges.make_range('1', 5, 20)
          ],
          expected=2),

      # Read has the maximum overlap with the middle range.
      dict(
          query_range=ranges.make_range('1', 5, 13),
          search_ranges=[
              ranges.make_range('1', 0, 10),
              ranges.make_range('1', 0, 15),
              ranges.make_range('1', 10, 20)
          ],
          expected=1),

      # Read has a different reference_name with other ranges.
      dict(
          query_range=ranges.make_range('2', 0, 10),
          search_ranges=[
              ranges.make_range('1', 0, 10),
              ranges.make_range('2', 5, 15),
              ranges.make_range('3', 0, 10)
          ],
          expected=1),

      # Read has equal overlap in two ranges.
      dict(
          query_range=ranges.make_range('1', 5, 15),
          search_ranges=[
              ranges.make_range('1', 0, 10),
              ranges.make_range('1', 10, 20),
              ranges.make_range('1', 12, 20)
          ],
          expected=0),
  )
  def test_find_max_overlapping(self, query_range, search_ranges, expected):
    """`find_max_overlapping` returns the best range's index, or None."""
    actual = ranges.find_max_overlapping(query_range, search_ranges)
    self.assertEqual(expected, actual)

  def test_find_max_overlapping_allows_unordered_search_ranges(self):
    """The same range wins regardless of the order of search_ranges."""
    query_range = ranges.make_range('1', 4, 12)
    search_ranges = [
        ranges.make_range('1', 0, 10),
        ranges.make_range('1', 10, 20),
        ranges.make_range('1', 12, 20)
    ]
    max_overlapping_range = search_ranges[0]

    for permutated_ranges in itertools.permutations(search_ranges):
      self.assertEqual(
          permutated_ranges.index(max_overlapping_range),
          ranges.find_max_overlapping(query_range, permutated_ranges))

  def test_find_max_overlapping_returns_least_index(self):
    """When overlaps tie, the lowest index is returned."""
    query_range = ranges.make_range('1', 0, 10)
    search_ranges = [
        ranges.make_range('1', 0, 5),
        ranges.make_range('1', 5, 10)
    ]

    for to_search in [search_ranges, list(reversed(search_ranges))]:
      self.assertEqual(0, ranges.find_max_overlapping(query_range, to_search))

  @parameterized.parameters(
      dict(
          regions=[
              ranges.make_range('1', 1, 10),
          ],
          expected_span=ranges.make_range('1', 1, 10),
      ),
      dict(
          regions=[
              ranges.make_range('1', 1, 10),
              ranges.make_range('1', 10, 100),
          ],
          expected_span=ranges.make_range('1', 1, 100),
      ),
      dict(
          regions=[
              ranges.make_range('1', 1, 10),
              ranges.make_range('1', 10, 100),
              ranges.make_range('1', 2, 20),
          ],
          expected_span=ranges.make_range('1', 1, 100),
      ),
      # potential edge cases:
      # same start, different ends.
      dict(
          regions=[
              ranges.make_range('1', 1, 10),
              ranges.make_range('1', 1, 100),
          ],
          expected_span=ranges.make_range('1', 1, 100),
      ),
      # same end, different starts.
      dict(
          regions=[
              ranges.make_range('1', 1, 10),
              ranges.make_range('1', 2, 10),
          ],
          expected_span=ranges.make_range('1', 1, 10),
      ),
  )
  def test_span_computes_span_correctly(self, regions, expected_span):
    """`span` gives the same result for every ordering of `regions`."""
    for permutation in itertools.permutations(regions, len(regions)):
      self.assertEqual(expected_span, ranges.span(permutation))

  @parameterized.parameters(
      dict(regions=[], regexp='empty'),
      dict(
          regions=[
              ranges.make_range('1', 0, 2),
              ranges.make_range('2', 0, 2),
          ],
          regexp='regions must be all on the same contig'),
      dict(
          regions=[
              ranges.make_range('1', 0, 2),
              ranges.make_range('1', 0, 3),
              ranges.make_range('2', 0, 2),
          ],
          regexp='regions must be all on the same contig'),
  )
  def test_span_raises_on_bad_input(self, regions, regexp):
    """`span` raises ValueError for empty input or mixed contigs."""
    with self.assertRaisesRegex(ValueError, regexp):
      ranges.span(regions)

  @parameterized.parameters(
      dict(
          region=ranges.make_range('1', 10, 20),
          n_bp=n_bp,
          contig_map=None,
          expected=ranges.make_range('1', 10 - n_bp, 20 + n_bp),
      ) for n_bp in range(10))
  def test_expand_is_correct(self, region, n_bp, contig_map, expected):
    """`expand` widens the range by n_bp on each side."""
    self.assertEqual(expected, ranges.expand(region, n_bp, contig_map))

  @parameterized.parameters(
      # Check that we don't create Ranges with negative starts.
      dict(
          region=ranges.make_range('1', 10, 20),
          n_bp=20,
          contig_map=None,
          expected=ranges.make_range('1', 0, 40),
      ),
      # Check that the end is capped at the contig's n_bases when contig_map is
      # provided (20 + 40 would otherwise give an end of 60).
      dict(
          region=ranges.make_range('1', 10, 20),
          n_bp=40,
          contig_map={
              '1': reference_pb2.ContigInfo(name='1', n_bases=50),
          },
          expected=ranges.make_range('1', 0, 50),
      ),
  )
  def test_expand_handles_boundaries(self, region, n_bp, contig_map, expected):
    """`expand` clamps the start at 0 and the end at the contig length."""
    self.assertEqual(expected, ranges.expand(region, n_bp, contig_map))

  def test_expand_raises_on_negative_n_bp(self):
    """A negative n_bp raises ValueError with a message naming the value."""
    with self.assertRaisesRegex(ValueError, 'n_bp must be >= 0 but got -10'):
      ranges.expand(ranges.make_range('1', 10, 20), -10)

  def test_expand_raises_with_missing_contig_in_map(self):
    """`expand` raises KeyError when contig_map lacks the range's contig."""
    # Empty contig_map should raise.
    with self.assertRaises(KeyError):
      ranges.expand(ranges.make_range('1', 10, 20), 1, contig_map={})

    # Missing '1' from the contig map should raise.
    with self.assertRaises(KeyError):
      ranges.expand(
          ranges.make_range('1', 10, 20),
          1,
          contig_map={
              '2': reference_pb2.ContigInfo(name='2', n_bases=50),
          })
コード例 #24
0
ファイル: ranges_test.py プロジェクト: palc/deepvariant
from third_party.nucleus.protos import position_pb2
from third_party.nucleus.protos import reference_pb2
from third_party.nucleus.protos import variants_pb2
from third_party.nucleus.testing import test_utils
from third_party.nucleus.util import ranges

# Four ranges across chr1, chr2 and chr3 shared by the tests below.
# NOTE(review): presumably these mirror the intervals in the `test.bed`
# testdata file -- confirm against the file before editing.
_TEST_BED_REGIONS = [
    ranges.make_range('chr1', 1, 10),
    ranges.make_range('chr2', 20, 30),
    ranges.make_range('chr2', 40, 60),
    ranges.make_range('chr3', 80, 90),
]

# Three contigs with increasing sizes; pos_in_fasta follows the list order
# (chr1=0, chr2=1, chr3=2).
_TEST_CONTIGS = [
    reference_pb2.ContigInfo(name='chr1', n_bases=10, pos_in_fasta=0),
    reference_pb2.ContigInfo(name='chr2', n_bases=100, pos_in_fasta=1),
    reference_pb2.ContigInfo(name='chr3', n_bases=500, pos_in_fasta=2),
]


class RangesTests(parameterized.TestCase):
    def test_ranges_overlaps(self):
        def check_overlaps(chr1, start1, end1, chr2, start2, end2, expected):
            i1 = ranges.make_range(chr1, start1, end1)
            i2 = ranges.make_range(chr2, start2, end2)
            self.assertEqual(ranges.ranges_overlap(i1, i2), expected)
            self.assertEqual(ranges.ranges_overlap(i2, i1), expected)

        check_overlaps('chr1', 0, 3, 'chr1', 4, 10, False)
        check_overlaps('chr1', 0, 3, 'chr1', 3, 10, False)