コード例 #1
0
    def test_wrap(self, fasta_filename):
        """Checks the GenomeReferenceFai wrapper against a three-contig FASTA.

        Args:
          fasta_filename: Name of the FASTA test data file. Its index is
            looked up under the same name with a '.fai' suffix appended.
        """
        expected_names = ['chrM', 'chr1', 'chr2']
        expected_lengths = [100, 76, 121]
        fasta_path = test_utils.genomics_core_testdata(fasta_filename)
        fai_path = test_utils.genomics_core_testdata(fasta_filename + '.fai')

        with reference_fai.GenomeReferenceFai.from_file(
                fasta_path, fai_path) as ref:
            # Top-level metadata exposed by the reader.
            self.assertEqual(ref.n_contigs, 3)
            self.assertIn(fasta_path, ref.fasta_path)
            description = str(ref)
            self.assertIn('GenomeReference backed by htslib FAI index',
                          description)
            self.assertEqual(ref.contig_names, expected_names)
            self.assertEqual(ref.n_bp, sum(expected_lengths))

            # Fetching bases for a small region of chrM.
            head_of_chrm = ranges.make_range('chrM', 1, 10)
            self.assertEqual(ref.bases(head_of_chrm), 'ATCACAGGT')

            # Interval validation: one region accepted, one rejected.
            accepted = ranges.make_range('chrM', 1, 10)
            self.assertTrue(ref.is_valid_interval(accepted))
            rejected = ranges.make_range('chrM', 1, 100000)
            self.assertFalse(ref.is_valid_interval(rejected))

            # Per-contig records must agree with the expected names/lengths.
            self.assertEqual(len(ref.contigs), 3)
            self.assertEqual([info.name for info in ref.contigs],
                             expected_names)
            self.assertEqual([info.n_bases for info in ref.contigs],
                             expected_lengths)
            for info in ref.contigs:
                name = info.name
                self.assertEqual(ref.contig(name), info)
                self.assertTrue(ref.has_contig(name))
                self.assertFalse(ref.has_contig(name + '.unknown'))
コード例 #2
0
 def setUp(self):
     """Opens the two VCF readers stored on the test instance.

     `sites_reader` opens 'test_sites.vcf' with use_index=False, and
     `samples_reader` opens 'test_samples.vcf.gz' with use_index=True.
     """
     sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
     self.sites_reader = genomics_io.make_vcf_reader(
         sites_path, use_index=False)

     samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
     self.samples_reader = genomics_io.make_vcf_reader(
         samples_path, use_index=True)
コード例 #3
0
 def test_from_file_raises_with_missing_inputs(self, fasta_filename,
                                               fai_filename):
     """Checks that from_file raises a ValueError naming the fasta path.

     Args:
       fasta_filename: Name of the FASTA test data file.
       fai_filename: Name of the FAI index test data file.
     """
     import re

     fasta = test_utils.genomics_core_testdata(fasta_filename)
     fai = test_utils.genomics_core_testdata(fai_filename)
     # The expected message is used as a regular expression, so the path is
     # escaped: characters such as '+', '(' or '[' in a directory name would
     # otherwise be interpreted as regex syntax and break the match.
     expected_message = (
         'Not found: could not load fasta and/or fai for fasta ' +
         re.escape(fasta))
     with self.assertRaisesRegexp(ValueError, expected_message):
         reference_fai.GenomeReferenceFai.from_file(fasta, fai)
コード例 #4
0
 def setUp(self):
   """Prepares VCF paths, reader options and readers for the tests.

   The sites VCF is opened with default options; the samples VCF is opened
   with index_mode set to INDEX_BASED_ON_FILENAME.
   """
   self.sites_vcf = test_utils.genomics_core_testdata('test_sites.vcf')
   self.samples_vcf = test_utils.genomics_core_testdata('test_samples.vcf.gz')

   self.unindexed_options = core_pb2.VcfReaderOptions()
   self.indexed_options = core_pb2.VcfReaderOptions(
       index_mode=core_pb2.INDEX_BASED_ON_FILENAME)

   open_vcf = vcf_reader.VcfReader.from_file
   self.sites_reader = open_vcf(self.sites_vcf, self.unindexed_options)
   self.samples_reader = open_vcf(self.samples_vcf, self.indexed_options)
コード例 #5
0
  def test_round_trip_vcf(self, test_datum_name):
    """Checks that variants are unchanged by a write/read round trip.

    Variants are read from the input VCF, written to a temporary VCF with
    VcfWriter, read back in, and compared with the originals.

    Args:
      test_datum_name: Name of the VCF test data file to round-trip.
    """
    source_path = test_utils.genomics_core_testdata(test_datum_name)
    copy_path = test_utils.test_tmpfile('output_' + test_datum_name)

    # Step 1: load the original variants.
    source_reader = genomics_io.make_vcf_reader(source_path, use_index=False)
    original_variants = list(source_reader.iterate())
    self.assertTrue(original_variants, 'Reader failed to find records')

    # Step 2: write them out, building the writer options from what the
    # source reader reports.
    options = core_pb2.VcfWriterOptions(
        contigs=source_reader.contigs,
        sample_names=source_reader.samples,
        filters=source_reader.filters)
    with vcf_writer.VcfWriter.to_file(copy_path, options) as sink:
      for variant in original_variants:
        sink.write(variant)

    # Step 3: read the copy back in.
    copy_reader = genomics_io.make_vcf_reader(copy_path, use_index=False)
    copied_variants = list(copy_reader.iterate())

    # Step 4: both sets of variants must be equal.
    self.assertEqual(original_variants, copied_variants,
                     'Round-tripped variants not as expected')
コード例 #6
0
 def test_sam_iterate_raises_on_malformed_record(self):
     """Checks that iterating 'malformed.sam' raises after one good record."""
     sam_path = test_utils.genomics_core_testdata('malformed.sam')
     # The reader stays bound to a local for the whole test.
     reader = sam_reader.SamReader.from_file(sam_path, self.options)
     records = iter(reader.iterate())

     # The first record is expected to be returned normally ...
     first_record = next(records)
     self.assertIsNotNone(first_record)

     # ... while consuming the rest must raise ValueError.
     with self.assertRaises(ValueError):
         list(records)
コード例 #7
0
  def test_from_regions(self, regions, expected):
    """Checks that ranges.from_regions yields the expected ranges.

    Args:
      regions: Sequence of region specs handed to ranges.from_regions. The
        literal 'test.bed' is shorthand for the file of that name in the
        testdata directory.
      expected: List that list(ranges.from_regions(...)) must equal.
    """
    # For convenience we allow 'test.bed' in our regions but the actual file
    # path is in our testdata directory. A new list is built instead of
    # assigning into `regions`, so the caller's argument is never modified.
    resolved_regions = [
        test_utils.genomics_core_testdata('test.bed')
        if region == 'test.bed' else region
        for region in regions
    ]

    self.assertEqual(list(ranges.from_regions(resolved_regions)), expected)
コード例 #8
0
 def test_from_bed(self):
   """Checks that RangeSet.from_bed('test.bed') holds the four known ranges."""
   bed_path = test_utils.genomics_core_testdata('test.bed')
   # (contig, start, end) triples expected from the BED file, in any order.
   expected_triples = (
       ('chr1', 1, 10),
       ('chr2', 20, 30),
       ('chr2', 40, 60),
       ('chr3', 80, 90),
   )
   expected_ranges = [
       ranges.make_range(contig, start, end)
       for contig, start, end in expected_triples
   ]
   self.assertCountEqual(expected_ranges, ranges.RangeSet.from_bed(bed_path))
コード例 #9
0
 def test_sam_query(self):
   """Checks the read counts returned by region queries on 'test.bam'."""
   bam_path = test_utils.genomics_core_testdata('test.bam')
   reader = genomics_io.make_sam_reader(bam_path)

   # Each case pairs a region literal with the read count expected for it.
   cases = [
       ('chr20:10,000,000-10,000,100', 106),
       ('chr20:10,000,000-10,000,000', 45),
   ]
   queries = [(ranges.parse_literal(literal), count)
              for literal, count in cases]

   with reader:
     for region, read_count in queries:
       with reader.query(region) as reads:
         self.assertEqual(test_utils.iterable_len(reads), read_count)
コード例 #10
0
 def test_bam_iterate_partially(self):
   """Verify that iteration provides results incrementally, not all at once."""
   reader = genomics_io.make_sam_reader(
       test_utils.genomics_core_testdata('test.bam'), use_index=False)
   with reader:
     iterable = reader.iterate()
     # We expect 106 records in total: ten full batches of 10 followed by a
     # final partial batch of 6.
     # `range` (rather than the Python 2-only `xrange`) keeps this loop
     # working on both Python 2 and Python 3 without a compatibility import.
     for _ in range(10):
       results = list(itertools.islice(iterable, 10))
       self.assertEqual(len(results), 10)
     results = list(itertools.islice(iterable, 10))
     self.assertEqual(len(results), 6)
コード例 #11
0
 def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
   """Checks how many reads a downsampling SAM reader returns.

   Args:
     method: Either 'iterate' or 'query'; selects the reader call under test.
       Any other value fails the test.
     maybe_range: Region literal passed to ranges.parse_literal. Only used
       when method is 'query'.
     fraction: Value passed to the reader as downsample_fraction.
     expected_n_reads: Number of reads the chosen call is expected to return.
   """
   reader = genomics_io.make_sam_reader(
       test_utils.genomics_core_testdata('test.bam'),
       downsample_fraction=fraction,
       random_seed=12345)
   with reader:
     if method == 'iterate':
       reads_iter = reader.iterate()
     elif method == 'query':
       reads_iter = reader.query(ranges.parse_literal(maybe_range))
     else:
       # TestCase.fail() accepts a single message argument, so the offending
       # value is formatted into the message instead of being passed as a
       # second positional argument (which would raise TypeError).
       self.fail('Unexpected method: {!r}'.format(method))
     self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
コード例 #12
0
 def setUp(self):
     """Stores the 'test.bam' path and two SamReaderOptions on the instance.

     `options` is a default SamReaderOptions; `indexed_options` has
     index_mode set to INDEX_BASED_ON_FILENAME.
     """
     self.bam = test_utils.genomics_core_testdata('test.bam')

     make_options = core_pb2.SamReaderOptions
     self.options = make_options()
     self.indexed_options = make_options(
         index_mode=core_pb2.INDEX_BASED_ON_FILENAME)
コード例 #13
0
  def test_writing_canned_variants(self):
    """Writes the first five canned variants to a VCF and checks its text.

    The variants come from a tfrecord test data file, are written with
    VcfWriter using a fixed set of contigs, filters and one sample name, and
    the resulting file is compared line by line with the expected content.
    """
    # (name, n_bases) for every contig declared in the VCF header.
    contig_sizes = [
        ('chr1', 248956422),
        ('chr2', 242193529),
        ('chr3', 198295559),
        ('chrX', 156040895),
    ]
    # Filter ids declared in the VCF header, in header order.
    filter_ids = [
        'LowQual',
        'VQSRTrancheINDEL95.00to96.00',
        'VQSRTrancheINDEL96.00to97.00',
        'VQSRTrancheINDEL97.00to99.00',
        'VQSRTrancheINDEL99.00to99.50',
        'VQSRTrancheINDEL99.50to99.90',
        'VQSRTrancheINDEL99.90to99.95',
        'VQSRTrancheINDEL99.95to100.00+',
        'VQSRTrancheINDEL99.95to100.00',
        'VQSRTrancheSNP99.50to99.60',
        'VQSRTrancheSNP99.60to99.80',
        'VQSRTrancheSNP99.80to99.90',
        'VQSRTrancheSNP99.90to99.95',
        'VQSRTrancheSNP99.95to100.00+',
        'VQSRTrancheSNP99.95to100.00',
    ]
    header_options = core_pb2.VcfWriterOptions(
        contigs=[
            core_pb2.ContigInfo(name=contig_name, n_bases=contig_size)
            for contig_name, contig_size in contig_sizes
        ],
        sample_names=['NA12878_18_99'],
        filters=[
            core_pb2.VcfFilterInfo(id=filter_id) for filter_id in filter_ids
        ])

    # This file is in the TF record format
    golden_path = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')
    canned_variants = list(
        io_utils.read_tfrecords(golden_path, proto=variants_pb2.Variant))

    vcf_path = test_utils.test_tmpfile('output.vcf')
    with vcf_writer.VcfWriter.to_file(vcf_path, header_options) as sink:
      for variant in canned_variants[:5]:
        sink.write(variant)

    # Check: are the variants written as expected?
    # pylint: disable=line-too-long
    expected_lines = [
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=PASS,Description="All filters passed">\n',
        '##FILTER=<ID=LowQual,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
        'passing filters reads.">\n',
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
        'passing filters reads for each allele.">\n',
        '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
        'fractions.">\n',
        '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype '
        'likelihoods, log10 encoded">\n',
        '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
        'likelihoods, Phred encoded">\n',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
        'the interval">\n',
        '##contig=<ID=chr1,length=248956422>\n',
        '##contig=<ID=chr2,length=242193529>\n',
        '##contig=<ID=chr3,length=198295559>\n',
        '##contig=<ID=chrX,length=156040895>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
        'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
        'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
        'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
        'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
        'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
    ]
    # pylint: enable=line-too-long

    with tf.gfile.GFile(vcf_path, 'r') as vcf_file:
      written_lines = vcf_file.readlines()
      self.assertEqual(written_lines, expected_lines)
コード例 #14
0
 def test_make_ref_reader(self, fasta_filename):
   """Checks that a reader from make_ref_reader returns the expected bases.

   Args:
     fasta_filename: Name of the FASTA test data file to open.
   """
   reference = genomics_io.make_ref_reader(
       test_utils.genomics_core_testdata(fasta_filename))
   with reference as reader:
     region = ranges.make_range('chrM', 1, 6)
     self.assertEqual(reader.bases(region), 'ATCAC')
コード例 #15
0
 def test_from_file_raises_with_missing_index(self):
   """Checks that opening 'test_sites.vcf' with indexed options raises."""
   vcf_path = test_utils.genomics_core_testdata('test_sites.vcf')
   expected_error = 'Not found: No index found for'
   with self.assertRaisesRegexp(ValueError, expected_error):
     vcf_reader.VcfReader.from_file(vcf_path, self.indexed_options)
コード例 #16
0
 def test_bam_iterate(self):
   """Expects 106 records when iterating 'test.bam' with use_index=False."""
   bam_path = test_utils.genomics_core_testdata('test.bam')
   reader = genomics_io.make_sam_reader(bam_path, use_index=False)
   with reader:
     n_records = test_utils.iterable_len(reader.iterate())
     self.assertEqual(n_records, 106)
コード例 #17
0
 def test_from_file_raises_with_missing_index(self):
     """Checks that opening 'unindexed.bam' with indexed options raises."""
     bam_path = test_utils.genomics_core_testdata('unindexed.bam')
     expected_error = 'Not found: No index found for'
     with self.assertRaisesRegexp(ValueError, expected_error):
         sam_reader.SamReader.from_file(bam_path, self.indexed_options)