Example #1
0
  def setUp(self):
    """Creates the sites (no index) and samples (indexed) VCF readers."""
    sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
    self.sites_reader = vcf.VcfReader(sites_path, use_index=False)

    samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
    self.samples_reader = vcf.VcfReader(samples_path, use_index=True)
    def test_wrap(self, fasta_filename):
        """Checks contig metadata and base lookups of the FAI-backed reference.

        Args:
          fasta_filename: Name of a FASTA file resolved through
            test_utils.genomics_core_testdata; the same name with a '.fai'
            suffix is used as its index.
        """
        expected_names = ['chrM', 'chr1', 'chr2']
        expected_lengths = [100, 76, 121]
        fasta_path = test_utils.genomics_core_testdata(fasta_filename)
        fai_path = test_utils.genomics_core_testdata(fasta_filename + '.fai')
        genome_cls = reference_fai.GenomeReferenceFai
        with genome_cls.from_file(fasta_path, fai_path) as genome:
            # Summary properties of the whole reference.
            self.assertEqual(genome.n_contigs, 3)
            self.assertIn(fasta_path, genome.fasta_path)
            self.assertIn('GenomeReference backed by htslib FAI index',
                          str(genome))
            self.assertEqual(genome.contig_names, expected_names)
            self.assertEqual(genome.n_bp, sum(expected_lengths))

            # Base lookup on a small interval of chrM.
            leading_bases = genome.bases(ranges.make_range('chrM', 1, 10))
            self.assertEqual(leading_bases, 'ATCACAGGT')

            # Interval validation: the second interval ends far past chrM.
            in_bounds = ranges.make_range('chrM', 1, 10)
            self.assertTrue(genome.is_valid_interval(in_bounds))
            out_of_bounds = ranges.make_range('chrM', 1, 100000)
            self.assertFalse(genome.is_valid_interval(out_of_bounds))

            # Per-contig metadata and lookups by name.
            self.assertEqual(len(genome.contigs), 3)
            self.assertEqual([info.name for info in genome.contigs],
                             expected_names)
            self.assertEqual([info.n_bases for info in genome.contigs],
                             expected_lengths)
            for info in genome.contigs:
                self.assertEqual(genome.contig(info.name), info)
                self.assertTrue(genome.has_contig(info.name))
                self.assertFalse(genome.has_contig(info.name + '.unknown'))
 def test_from_file_raises_with_missing_inputs(self, fasta_filename,
                                               fai_filename):
   """Checks GenomeReferenceFai.from_file raises ValueError on bad inputs.

   Args:
     fasta_filename: Name passed to test_utils.genomics_core_testdata to
       build the FASTA path.
     fai_filename: Name passed to test_utils.genomics_core_testdata to build
       the FAI index path.
   """
   fasta = test_utils.genomics_core_testdata(fasta_filename)
   fai = test_utils.genomics_core_testdata(fai_filename)
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported name for the same check.
   with self.assertRaisesRegex(
       ValueError,
       'Not found: could not load fasta and/or fai for fasta ' + fasta):
     reference_fai.GenomeReferenceFai.from_file(fasta, fai)
 def setUp(self):
   """Opens readers on the sites and samples VCFs with default options."""
   default_options = variants_pb2.VcfReaderOptions()
   open_vcf = vcf_reader.VcfReader.from_file

   sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
   samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')

   self.sites_vcf = sites_path
   self.samples_vcf = samples_path
   self.options = default_options
   self.sites_reader = open_vcf(sites_path, default_options)
   self.samples_reader = open_vcf(samples_path, default_options)
 def test_from_file_raises_with_missing_inputs(self, fasta_filename,
                                               fai_filename):
   """Checks IndexedFastaReader.from_file raises ValueError on bad inputs.

   Args:
     fasta_filename: Name passed to test_utils.genomics_core_testdata to
       build the FASTA path.
     fai_filename: Name passed to test_utils.genomics_core_testdata to build
       the FAI index path.
   """
   fasta = test_utils.genomics_core_testdata(fasta_filename)
   fai = test_utils.genomics_core_testdata(fai_filename)
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported name for the same check.
   with self.assertRaisesRegex(
       ValueError,
       'Not found: could not load fasta and/or fai for fasta ' + fasta):
     reference.IndexedFastaReader.from_file(fasta, fai)
Example #6
0
    def setUp(self):
        """Creates the sites (no index) and samples (indexed) VCF readers."""
        sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
        self.sites_reader = vcf.VcfReader(sites_path, use_index=False)

        samples_path = test_utils.genomics_core_testdata(
            'test_samples.vcf.gz')
        self.samples_reader = vcf.VcfReader(samples_path, use_index=True)
Example #7
0
 def test_iterate(self, fasta_filename):
     """Indexed and unindexed readers must iterate over identical records.

     Args:
       fasta_filename: Name of the FASTA file resolved through
         test_utils.genomics_core_testdata and opened by both readers.
     """
     indexed_path = test_utils.genomics_core_testdata(fasta_filename)
     with_index = fasta.IndexedFastaReader(indexed_path)
     unindexed_path = test_utils.genomics_core_testdata(fasta_filename)
     without_index = fasta.UnindexedFastaReader(unindexed_path)

     from_indexed = list(with_index.iterate())
     from_unindexed = list(without_index.iterate())
     self.assertEqual(from_indexed, from_unindexed)
Example #8
0
 def setUp(self):
   """Opens the sites VCF without an index and the samples VCF with one."""
   no_index = variants_pb2.VcfReaderOptions()
   filename_index = variants_pb2.VcfReaderOptions(
       index_mode=index_pb2.INDEX_BASED_ON_FILENAME)
   self.unindexed_options = no_index
   self.indexed_options = filename_index

   sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
   samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
   self.sites_vcf = sites_path
   self.samples_vcf = samples_path

   open_vcf = vcf_reader.VcfReader.from_file
   self.sites_reader = open_vcf(sites_path, no_index)
   self.samples_reader = open_vcf(samples_path, filename_index)
Example #9
0
 def _make_reader(self, filename, has_embedded_ref):
     """Returns a SamReader for `filename`, with or without a ref_path.

     Args:
       filename: Name of the reads file resolved through
         test_utils.genomics_core_testdata.
       has_embedded_ref: If true, no ref_path is passed to the reader;
         otherwise the path of 'test.fasta' is passed as ref_path.

     Returns:
       The constructed sam.SamReader.
     """
     reads_path = test_utils.genomics_core_testdata(filename)
     if not has_embedded_ref:
         # We need to explicitly override the reference encoded in the UR of
         # the CRAM file to use the path provided to our test.fasta.
         fasta_path = test_utils.genomics_core_testdata('test.fasta')
         return sam.SamReader(reads_path, ref_path=fasta_path)
     # With an embedded reference, force the reader to use it by not
     # providing an argument for ref_path.
     return sam.SamReader(reads_path)
Example #10
0
 def test_dispatching_reader(self):
   """query() works on test.fasta but raises on unindexed.fasta."""
   indexed_path = test_utils.genomics_core_testdata('test.fasta')
   with fasta.FastaReader(indexed_path) as indexed:
     # Here the reader is an IndexedFastaReader, which supports query().
     bases = indexed.query(ranges.make_range('chrM', 1, 6))
     self.assertEqual(bases, 'ATCAC')

   unindexed_path = test_utils.genomics_core_testdata('unindexed.fasta')
   with fasta.FastaReader(unindexed_path) as unindexed:
     # Here the reader is an UnindexedFastaReader, which doesn't support
     # query().
     with self.assertRaises(NotImplementedError):
       unindexed.query(ranges.make_range('chrM', 1, 5))
 def test_sam_iterate_raises_on_malformed_record(self):
   """The first record of malformed.sam reads; draining the rest raises."""
   bad_path = test_utils.genomics_core_testdata('malformed.sam')
   # Keep the reader bound to a local so it stays alive while iterating.
   bad_reader = sam_reader.SamReader.from_file(bad_path, self.options)
   records = iter(bad_reader.iterate())

   first_record = next(records)
   self.assertIsNotNone(first_record)
   with self.assertRaises(ValueError):
     list(records)
Example #12
0
  def test_c_reader(self):
    """c_reader is nonzero for the fixture readers and a TFRecord reader."""
    for fixture_reader in (self.sites_reader, self.samples_reader):
      self.assertNotEqual(fixture_reader.c_reader, 0)

    golden_path = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')
    from_tfrecord = vcf.VcfReader(golden_path)
    self.assertNotEqual(from_tfrecord.c_reader, 0)
Example #13
0
 def setUp(self):
   """Copies the samples VCF to a temp file and records index file names."""
   super(TabixTest, self).setUp()
   vcf_name = 'test_samples.vcf.gz'
   self.input_file = test_utils.genomics_core_testdata(vcf_name)
   self.output_file = test_utils.test_tmpfile(vcf_name)
   # Work on a copy in the temp directory rather than on the testdata file.
   shutil.copyfile(self.input_file, self.output_file)
   # Index names are the copied file's path plus the '.tbi' / '.csi' suffix.
   copied_path = self.output_file
   self.tbx_index_file = copied_path + '.tbi'
   self.csi_index_file = copied_path + '.csi'
Example #14
0
  def test_roundtrip(self,
                     expected_infos,
                     expected_fmt,
                     expected_fmt1,
                     expected_fmt2,
                     reader_excluded_info=None,
                     reader_excluded_format=None,
                     writer_excluded_info=None,
                     writer_excluded_format=None):
    """Reads test_py_roundtrip.vcf, rewrites it, and compares the text.

    Args:
      expected_infos: Per-record values substituted for `info` in the record
        format strings.
      expected_fmt: Value substituted for `fmt` in every record format string.
      expected_fmt1: Per-record values substituted for `efmts1`.
      expected_fmt2: Per-record values substituted for `efmts2`.
      reader_excluded_info: Passed to VcfReader as excluded_info_fields.
      reader_excluded_format: Passed to VcfReader as excluded_format_fields.
      writer_excluded_info: Passed to VcfWriter as excluded_info_fields.
      writer_excluded_format: Passed to VcfWriter as excluded_format_fields.
    """
    # Build the expected file text: the header followed by each formatted
    # record line.
    per_record_values = zip(self.record_format_strings, expected_infos,
                            expected_fmt1, expected_fmt2)
    expected_lines = []
    for template, info, fmt1, fmt2 in per_record_values:
      expected_lines.append(
          template.format(
              info=info, fmt=expected_fmt, efmts1=fmt1, efmts2=fmt2))
    expected = self.header + ''.join(expected_lines)

    input_path = test_utils.genomics_core_testdata('test_py_roundtrip.vcf')
    with vcf.VcfReader(
        input_path,
        use_index=False,
        excluded_info_fields=reader_excluded_info,
        excluded_format_fields=reader_excluded_format) as reader:

      variants = list(reader.iterate())
      output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
      # The writer is opened while the reader is still open because it takes
      # its header from the reader.
      with vcf.VcfWriter(
          output_path,
          header=reader.header,
          excluded_info_fields=writer_excluded_info,
          excluded_format_fields=writer_excluded_format) as writer:
        for variant in variants:
          writer.write(variant)

    with open(output_path) as written:
      actual = written.read()
    self.assertEqual(actual, expected)
  def test_round_trip_vcf(self, test_datum_name):
    """Checks that variants are unchanged by a write-then-read cycle.

    Args:
      test_datum_name: Name of the input VCF resolved through
        test_utils.genomics_core_testdata; the temp output file is named
        'output_' plus this name.
    """
    # Round-trip variants through writing and reading:
    # 1. Read variants v1 from VcfReader;
    # 2. Write v1 to vcf using our VcfWriter;
    # 3. Read back in using VcfReader -- v2;
    # 4. compare v1 and v2.
    in_file = test_utils.genomics_core_testdata(test_datum_name)
    out_file = test_utils.test_tmpfile('output_' + test_datum_name)

    # Use the readers as context managers so that they are closed once their
    # records (and the header copy) have been taken, instead of being left
    # open for the rest of the test.
    with vcf.VcfReader(in_file, use_index=False) as v1_reader:
      v1_records = list(v1_reader.iterate())
      header = copy.deepcopy(v1_reader.header)
    self.assertTrue(v1_records, 'Reader failed to find records')

    writer_options = variants_pb2.VcfWriterOptions()

    with vcf_writer.VcfWriter.to_file(out_file, header,
                                      writer_options) as writer:
      for record in v1_records:
        writer.write(record)

    with vcf.VcfReader(out_file, use_index=False) as v2_reader:
      v2_records = list(v2_reader.iterate())

    self.assertEqual(v1_records, v2_records,
                     'Round-tripped variants not as expected')
 def test_sam_iterate_raises_on_malformed_record(self):
     """The first record of malformed.sam reads; draining the rest raises."""
     bad_path = test_utils.genomics_core_testdata('malformed.sam')
     # Keep the reader bound to a local so it stays alive while iterating.
     bad_reader = sam_reader.SamReader.from_file(bad_path, self.options)
     records = iter(bad_reader.iterate())

     first_record = next(records)
     self.assertIsNotNone(first_record)
     with self.assertRaises(ValueError):
         list(records)
Example #17
0
 def test_native_gff_header(self, gff_filename):
     """GffReader and NativeGffReader both report the expected GFF version.

     Args:
       gff_filename: Name of the GFF file resolved through
         test_utils.genomics_core_testdata.
     """
     path = test_utils.genomics_core_testdata(gff_filename)

     with gff.GffReader(path) as dispatching:
         version = dispatching.header.gff_version
         self.assertEqual(EXPECTED_GFF_VERSION, version)

     with gff.NativeGffReader(path) as native:
         version = native.header.gff_version
         self.assertEqual(EXPECTED_GFF_VERSION, version)
Example #18
0
    def test_round_trip_vcf(self, test_datum_name):
        """Checks that variants are unchanged by a write-then-read cycle.

        Args:
          test_datum_name: Name of the input VCF resolved through
            test_utils.genomics_core_testdata; the temp output file is named
            'output_' plus this name.
        """
        # Round-trip variants through writing and reading:
        # 1. Read variants v1 from VcfReader;
        # 2. Write v1 to vcf using our VcfWriter;
        # 3. Read back in using VcfReader -- v2;
        # 4. compare v1 and v2.
        in_file = test_utils.genomics_core_testdata(test_datum_name)
        out_file = test_utils.test_tmpfile('output_' + test_datum_name)

        # Use the readers as context managers so that they are closed once
        # their records (and the header copy) have been taken, instead of
        # being left open for the rest of the test.
        with vcf.VcfReader(in_file) as v1_reader:
            v1_records = list(v1_reader.iterate())
            header = copy.deepcopy(v1_reader.header)
        self.assertTrue(v1_records, 'Reader failed to find records')

        writer_options = variants_pb2.VcfWriterOptions()

        with vcf_writer.VcfWriter.to_file(out_file, header,
                                          writer_options) as writer:
            for record in v1_records:
                writer.write(record)

        with vcf.VcfReader(out_file) as v2_reader:
            v2_records = list(v2_reader.iterate())

        self.assertEqual(v1_records, v2_records,
                         'Round-tripped variants not as expected')
 def test_headless_sam_raises(self):
     """Opening headerless.sam raises ValueError about a bad SAM header."""
     sam_path = test_utils.genomics_core_testdata('headerless.sam')
     expected_message = 'Could not parse file with bad SAM header'
     with self.assertRaisesRegex(ValueError, expected_message):
         sam_reader.SamReader.from_file(
             reads_path=sam_path, ref_path='', options=self.options)
Example #20
0
 def setUp(self):
     """Loads GffRecords from the TFRecord testdata and builds a header."""
     tfrecord_path = test_utils.genomics_core_testdata(
         'test_features.gff.tfrecord')
     record_iter = io_utils.read_tfrecords(
         tfrecord_path, proto=gff_pb2.GffRecord)
     self.records = list(record_iter)

     region = ranges.make_range('ctg123', 0, 1497228)
     self.header = gff_pb2.GffHeader(sequence_regions=[region])
Example #21
0
 def test_query_without_index_raises(self, unindexed_file_name):
   """query() raises ValueError for the given unindexed reads file.

   Args:
     unindexed_file_name: Name of the reads file resolved through
       test_utils.genomics_core_testdata.
   """
   reads_path = test_utils.genomics_core_testdata(unindexed_file_name)
   region = ranges.parse_literal('chr20:10,000,000-10,000,100')
   opened = sam_reader.SamReader.from_file(
       reads_path=reads_path, ref_path='', options=self.options)
   with opened as reader:
     expected_message = 'Cannot query without an index'
     with self.assertRaisesRegex(ValueError, expected_message):
       reader.query(region)
Example #22
0
    def test_roundtrip(self,
                       expected_infos,
                       expected_fmt,
                       expected_fmt1,
                       expected_fmt2,
                       reader_excluded_info=None,
                       reader_excluded_format=None,
                       writer_excluded_info=None,
                       writer_excluded_format=None):
        """Reads test_py_roundtrip.vcf, rewrites it, and compares the text.

        Args:
          expected_infos: Per-record values substituted for `info` in the
            record format strings.
          expected_fmt: Value substituted for `fmt` in every record format
            string.
          expected_fmt1: Per-record values substituted for `efmts1`.
          expected_fmt2: Per-record values substituted for `efmts2`.
          reader_excluded_info: Passed to VcfReader as excluded_info_fields.
          reader_excluded_format: Passed to VcfReader as
            excluded_format_fields.
          writer_excluded_info: Passed to VcfWriter as excluded_info_fields.
          writer_excluded_format: Passed to VcfWriter as
            excluded_format_fields.
        """
        # Build the expected file text: the header followed by each formatted
        # record line.
        per_record_values = zip(self.record_format_strings, expected_infos,
                                expected_fmt1, expected_fmt2)
        expected_lines = []
        for template, info, fmt1, fmt2 in per_record_values:
            expected_lines.append(
                template.format(
                    info=info, fmt=expected_fmt, efmts1=fmt1, efmts2=fmt2))
        expected = self.header + ''.join(expected_lines)

        input_path = test_utils.genomics_core_testdata(
            'test_py_roundtrip.vcf')
        with vcf.VcfReader(
                input_path,
                excluded_info_fields=reader_excluded_info,
                excluded_format_fields=reader_excluded_format) as reader:

            variants = list(reader.iterate())
            output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
            # The writer is opened while the reader is still open because it
            # takes its header from the reader.
            with vcf.VcfWriter(
                    output_path,
                    header=reader.header,
                    excluded_info_fields=writer_excluded_info,
                    excluded_format_fields=writer_excluded_format) as writer:
                for variant in variants:
                    writer.write(variant)

        with open(output_path) as written:
            actual = written.read()
        self.assertEqual(actual, expected)
Example #23
0
 def test_bed_iterate_raises_on_malformed_record(self, filename):
     """The first BED record reads; draining the remainder raises ValueError.

     Args:
       filename: Name of the malformed BED file resolved through
         test_utils.genomics_core_testdata.
     """
     bad_path = test_utils.genomics_core_testdata(filename)
     # Keep the reader bound to a local so it stays alive while iterating.
     bad_reader = bed_reader.BedReader.from_file(bad_path, self.options)
     records = iter(bad_reader.iterate())

     first_record = next(records)
     self.assertIsNotNone(first_record)
     with self.assertRaises(ValueError):
         list(records)
 def test_bed_iterate_raises_on_malformed_record(self, filename):
   """The first BED record reads; draining the remainder raises ValueError.

   Args:
     filename: Name of the malformed BED file resolved through
       test_utils.genomics_core_testdata.
   """
   bad_path = test_utils.genomics_core_testdata(filename)
   # Keep the reader bound to a local so it stays alive while iterating.
   bad_reader = bed_reader.BedReader.from_file(bad_path, self.options)
   records = iter(bad_reader.iterate())

   first_record = next(records)
   self.assertIsNotNone(first_record)
   with self.assertRaises(ValueError):
     list(records)
    def test_conversion_to_tfrecord_and_back(self, original_input_file):
        """Test conversion from a native file format to tfrecord.gz, then back.

        Args:
          original_input_file: Name of the native-format input resolved
            through test_utils.genomics_core_testdata; also used to name the
            temporary output files.

        Raises:
          unittest.SkipTest: If the native output path ends with ".sam".
        """
        input_path = test_utils.genomics_core_testdata(original_input_file)
        tfrecord_output_path = test_utils.test_tmpfile(original_input_file +
                                                       ".tfrecord.gz")
        native_output_path = test_utils.test_tmpfile(original_input_file)

        # Test conversion from native format to tfrecord.
        self._convert(input_path, tfrecord_output_path)

        # redacted
        if native_output_path.endswith(".sam"):
            raise unittest.SkipTest("SAM writing not yet supported")

        # Test conversion from tfrecord format back to native format.  Ensure that
        # conversions where we would need a header, but don't have one from the
        # input, trigger an error message.
        if any(
                native_output_path.endswith(ext)
                for ext in FORMATS_REQUIRING_HEADER):
            # assertRaisesRegexp is a deprecated alias that was removed in
            # Python 3.12; assertRaisesRegex is the supported name.
            with self.assertRaisesRegex(
                    converter.ConversionError,
                    "Input file does not have a header, which is needed to construct "
                    "output file"):
                self._convert(tfrecord_output_path, native_output_path)

        else:
            self._convert(tfrecord_output_path, native_output_path)
Example #26
0
  def test_from_regions(self, regions, expected):
    """Checks ranges.from_regions against the expected list of ranges.

    Args:
      regions: Sequence of region specifications. The literal 'test.bed' is a
        placeholder that is swapped for the testdata path of test.bed.
      expected: The list that list(ranges.from_regions(...)) must equal.
    """
    # For convenience we allow 'test.bed' in our regions but the actual file
    # path is in our testdata directory. Build a new list rather than
    # assigning into `regions`, so the caller's (possibly shared) parameter
    # list is left untouched.
    resolved_regions = [
        test_utils.genomics_core_testdata('test.bed')
        if region == 'test.bed' else region for region in regions
    ]

    self.assertEqual(list(ranges.from_regions(resolved_regions)), expected)
Example #27
0
 def test_ops_on_closed_reader_raise(self):
   """iterate() raises ValueError once the reader's with-block has exited."""
   file_path = test_utils.genomics_core_testdata('test_features.gff')
   reader = gff_reader.GffReader.from_file(file_path, self.options)
   with reader:
     pass
   # At this point the reader is closed.
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported name for the same check.
   with self.assertRaisesRegex(ValueError, 'Cannot Iterate a closed'):
     reader.iterate()
Example #28
0
 def test_query_on_unindexed_reader_raises(self):
     """query() on a reader opened from test_samples.vcf raises ValueError."""
     window = ranges.parse_literal('chr1:10,000,000-10,000,100')
     unindexed_file = test_utils.genomics_core_testdata('test_samples.vcf')
     with vcf_reader.VcfReader.from_file(unindexed_file,
                                         self.options) as reader:
         # assertRaisesRegexp is a deprecated alias that was removed in
         # Python 3.12; assertRaisesRegex is the supported name.
         with self.assertRaisesRegex(ValueError,
                                     'Cannot query without an index'):
             reader.query(window)
 def test_headless_sam_raises(self):
     """Fetching the first record of headerless.sam raises ValueError."""
     sam_path = test_utils.genomics_core_testdata('headerless.sam')
     # Keep the reader bound to a local so it stays alive while iterating.
     headless_reader = sam_reader.SamReader.from_file(
         reads_path=sam_path, ref_path='', options=self.options)
     records = iter(headless_reader.iterate())
     with self.assertRaises(ValueError):
         next(records)
Example #30
0
 def test_from_bed(self, bed_filename):
   """RangeSet.from_bed yields the four expected ranges, in any order.

   Args:
     bed_filename: Name of the BED file resolved through
       test_utils.genomics_core_testdata.
   """
   bed_path = test_utils.genomics_core_testdata(bed_filename)
   interval_specs = [
       ('chr1', 1, 10),
       ('chr2', 20, 30),
       ('chr2', 40, 60),
       ('chr3', 80, 90),
   ]
   expected_ranges = [
       ranges.make_range(chrom, start, end)
       for chrom, start, end in interval_specs
   ]
   self.assertCountEqual(expected_ranges, ranges.RangeSet.from_bed(bed_path))
Example #31
0
 def setUp(self):
     """Sets the BED paths, reader options, and the `first` BedRecord."""
     self.bed = test_utils.genomics_core_testdata('test_regions.bed')
     self.zipped_bed = test_utils.genomics_core_testdata(
         'test_regions.bed.gz')
     self.options = bed_pb2.BedReaderOptions()

     # Field values for the `first` record fixture.
     first_fields = dict(
         reference_name='chr1',
         start=10,
         end=20,
         name='first',
         score=100,
         strand=bed_pb2.BedRecord.FORWARD_STRAND,
         thick_start=12,
         thick_end=18,
         item_rgb='255,124,1',
         block_count=3,
         block_sizes='2,6,2',
         block_starts='10,12,18',
     )
     self.first = bed_pb2.BedRecord(**first_fields)
Example #32
0
 def testCompressed(self):
     """Reads the .tfrecord.gz file and spot-checks the first two records."""
     gz_path = test_utils.genomics_core_testdata(
         'test_features.gff.tfrecord.gz')
     gz_reader = genomics_reader.TFRecordReader(gz_path, gff_pb2.GffRecord())
     features = list(gz_reader.iterate())

     self.assertEqual('GenBank', features[0].source)
     self.assertEqual('ctg123', features[1].range.reference_name)
Example #33
0
 def test_sam_query(self):
   """Each queried interval of test.bam yields the expected read count."""
   bam_reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
   cases = [
       (ranges.parse_literal('chr20:10,000,000-10,000,100'), 106),
       (ranges.parse_literal('chr20:10,000,000-10,000,000'), 45),
   ]
   with bam_reader:
     for region, n_reads in cases:
       with bam_reader.query(region) as hits:
         n_found = test_utils.iterable_len(hits)
         self.assertEqual(n_found, n_reads)
 def setUp(self):
   """Sets the BED paths, reader options, and the `first` BedRecord."""
   self.bed = test_utils.genomics_core_testdata('test_regions.bed')
   self.zipped_bed = test_utils.genomics_core_testdata('test_regions.bed.gz')
   self.options = bed_pb2.BedReaderOptions()

   # Field values for the `first` record fixture.
   first_fields = dict(
       reference_name='chr1',
       start=10,
       end=20,
       name='first',
       score=100,
       strand=bed_pb2.BedRecord.FORWARD_STRAND,
       thick_start=12,
       thick_end=18,
       item_rgb='255,124,1',
       block_count=3,
       block_sizes='2,6,2',
       block_starts='10,12,18',
   )
   self.first = bed_pb2.BedRecord(**first_fields)
Example #35
0
 def test_iterate_bed_reader(self, bed_filename):
     """Iterating the BED file yields exactly the two expected intervals.

     Args:
       bed_filename: Name of the BED file resolved through
         test_utils.genomics_core_testdata.
     """
     path = test_utils.genomics_core_testdata(bed_filename)
     expected_intervals = [('chr1', 10, 20), ('chr1', 100, 200)]
     with bed.BedReader(path) as reader:
         found = list(reader.iterate())
     self.assertLen(found, 2)

     found_intervals = [
         (rec.reference_name, rec.start, rec.end) for rec in found
     ]
     self.assertEqual(found_intervals, expected_intervals)
Example #36
0
 def test_gff_iterate(self, test_features_gff_filename):
   """iterate() returns a wrapped C++ iterable over the two fixture records.

   Args:
     test_features_gff_filename: Name of the GFF file resolved through
       test_utils.genomics_core_testdata.
   """
   gff_path = test_utils.genomics_core_testdata(test_features_gff_filename)
   with gff_reader.GffReader.from_file(gff_path, self.options) as reader:
     wrapped = reader.iterate()
     self.assertIsInstance(wrapped, clif_postproc.WrappedCppIterable)

     records = list(wrapped)
     self.assertLen(records, 2)
     # assertLen above guarantees exactly two records to unpack.
     first_record, second_record = records
     self.assertEqual(first_record, self.first)
     self.assertEqual(second_record, self.second)
Example #37
0
 def test_sam_query(self):
   """Each queried interval of test.bam yields the expected read count."""
   bam_path = test_utils.genomics_core_testdata('test.bam')
   bam_reader = sam.SamReader(bam_path)
   wide_region = ranges.parse_literal('chr20:10,000,000-10,000,100')
   single_base = ranges.parse_literal('chr20:10,000,000-10,000,000')
   cases = [(wide_region, 106), (single_base, 45)]
   with bam_reader:
     for region, n_reads in cases:
       with bam_reader.query(region) as hits:
         self.assertEqual(test_utils.iterable_len(hits), n_reads)
Example #38
0
 def test_iterate_fastq_reader(self, fastq_filename):
   """Iterating the FASTQ file yields three records with the expected ids.

   Args:
     fastq_filename: Name of the FASTQ file resolved through
       test_utils.genomics_core_testdata.
   """
   path = test_utils.genomics_core_testdata(fastq_filename)
   wanted_ids = [
       'NODESC:header',
       'M01321:49:000000000-A6HWP:1:1101:17009:2216',
       'FASTQ',
   ]
   with fastq.FastqReader(path) as reader:
     found = list(reader.iterate())
   self.assertLen(found, 3)

   found_ids = [rec.id for rec in found]
   self.assertEqual(found_ids, wanted_ids)
Example #39
0
    def test_iterate_gff_reader(self, gff_filename):
        """Iterating the GFF file yields exactly the two expected ranges.

        Args:
          gff_filename: Name of the GFF file resolved through
            test_utils.genomics_core_testdata.
        """
        path = test_utils.genomics_core_testdata(gff_filename)
        expected_ranges = [('ctg123', 999, 9000), ('ctg123', 999, 1012)]

        with gff.GffReader(path) as reader:
            found = list(reader.iterate())
        self.assertLen(found, 2)

        found_ranges = []
        for rec in found:
            found_ranges.append(
                (rec.range.reference_name, rec.range.start, rec.range.end))
        self.assertEqual(found_ranges, expected_ranges)
Example #40
0
  def setUpClass(cls):
    """Builds a FASTA-backed reader and an in-memory reader of the same data.

    The in-memory reader is built from (name, 0, bases) tuples, where `bases`
    is the result of querying the FASTA reader over [0, n_bases) of each
    contig listed in its header.
    """
    fasta_path = test_utils.genomics_core_testdata('test.fasta')
    cls.fasta_reader = fasta.RefFastaReader(fasta_path)

    chromosomes = []
    for contig in cls.fasta_reader.header.contigs:
      whole_contig = ranges.make_range(contig.name, 0, contig.n_bases)
      bases = cls.fasta_reader.query(whole_contig)
      chromosomes.append((contig.name, 0, bases))
    cls.in_mem = fasta.InMemoryRefReader(chromosomes)
Example #41
0
 def test_bam_iterate_partially(self):
   """Verify that iteration provides results incrementally, not all at once."""
   reader = sam.SamReader(
       test_utils.genomics_core_testdata('test.bam'), use_index=False)
   with reader:
     iterable = reader.iterate()
     # We expect 106 records in total: ten slices of 10, then a final 6.
     # `range` is used instead of `xrange`, which is not a builtin on
     # Python 3; for ten iterations the two are interchangeable on Python 2.
     for _ in range(10):
       results = list(itertools.islice(iterable, 10))
       self.assertEqual(len(results), 10)
     results = list(itertools.islice(iterable, 10))
     self.assertEqual(len(results), 6)
Example #42
0
 def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
   """Checks the number of reads returned when downsampling test.bam.

   Args:
     method: Either 'iterate' or 'query'; selects how reads are fetched.
     maybe_range: Region literal parsed with ranges.parse_literal; only used
       when method is 'query'.
     fraction: Passed to SamReader as downsample_fraction.
     expected_n_reads: Expected number of reads produced by the reader.
   """
   reader = sam.SamReader(
       test_utils.genomics_core_testdata('test.bam'),
       downsample_fraction=fraction,
       random_seed=12345)
   with reader:
     if method == 'iterate':
       reads_iter = reader.iterate()
     elif method == 'query':
       reads_iter = reader.query(ranges.parse_literal(maybe_range))
     else:
       # TestCase.fail() accepts only a message; passing `method` as a second
       # positional argument would raise TypeError instead of failing the
       # test, so the value is formatted into the message.
       self.fail('Unexpected method: {}'.format(method))
     self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
Example #43
0
  def test_wrap(self, fasta_filename):
    """Checks contig metadata and base lookups of the FAI-backed reference.

    Args:
      fasta_filename: Name of a FASTA file resolved through
        test_utils.genomics_core_testdata; the same name with a '.fai' suffix
        is used as its index.
    """
    expected_names = ['chrM', 'chr1', 'chr2']
    expected_lengths = [100, 76, 121]
    fasta_path = test_utils.genomics_core_testdata(fasta_filename)
    fai_path = test_utils.genomics_core_testdata(fasta_filename + '.fai')
    genome_cls = reference_fai.GenomeReferenceFai
    with genome_cls.from_file(fasta_path, fai_path) as genome:
      # Summary properties of the whole reference.
      self.assertEqual(genome.n_contigs, 3)
      self.assertIn(fasta_path, genome.fasta_path)
      self.assertIn('GenomeReference backed by htslib FAI index', str(genome))
      self.assertEqual(genome.contig_names, expected_names)
      self.assertEqual(genome.n_bp, sum(expected_lengths))

      # Base lookup on a small interval of chrM.
      leading_bases = genome.bases(ranges.make_range('chrM', 1, 10))
      self.assertEqual(leading_bases, 'ATCACAGGT')

      # Interval validation: the second interval ends far past chrM.
      in_bounds = ranges.make_range('chrM', 1, 10)
      self.assertTrue(genome.is_valid_interval(in_bounds))
      out_of_bounds = ranges.make_range('chrM', 1, 100000)
      self.assertFalse(genome.is_valid_interval(out_of_bounds))

      # Per-contig metadata and lookups by name.
      self.assertEqual(len(genome.contigs), 3)
      self.assertEqual([info.name for info in genome.contigs], expected_names)
      self.assertEqual([info.n_bases for info in genome.contigs],
                       expected_lengths)
      for info in genome.contigs:
        self.assertEqual(genome.contig(info.name), info)
        self.assertTrue(genome.has_contig(info.name))
        self.assertFalse(genome.has_contig(info.name + '.unknown'))
  def test_writing_canned_variants(self):
    """Writes the first five canned variants and checks the VCF text exactly."""
    # The canned variants live in a file in the TF record format.
    golden_tfrecord = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')

    options = variants_pb2.VcfWriterOptions()

    # --- Header pieces, assembled separately and combined below. ---
    contigs = [
        reference_pb2.ContigInfo(name=contig_name, n_bases=contig_length)
        for contig_name, contig_length in [
            ('chr1', 248956422),
            ('chr2', 242193529),
            ('chr3', 198295559),
            ('chrX', 156040895),
        ]
    ]

    # The first two filters carry an explicit description; the VQSR tranche
    # filters are created with an id only.
    filters = [
        variants_pb2.VcfFilterInfo(
            id='PASS', description='All filters passed'),
        variants_pb2.VcfFilterInfo(id='LowQual', description=''),
    ]
    filters.extend(
        variants_pb2.VcfFilterInfo(id=tranche_id) for tranche_id in [
            'VQSRTrancheINDEL95.00to96.00',
            'VQSRTrancheINDEL96.00to97.00',
            'VQSRTrancheINDEL97.00to99.00',
            'VQSRTrancheINDEL99.00to99.50',
            'VQSRTrancheINDEL99.50to99.90',
            'VQSRTrancheINDEL99.90to99.95',
            'VQSRTrancheINDEL99.95to100.00+',
            'VQSRTrancheINDEL99.95to100.00',
            'VQSRTrancheSNP99.50to99.60',
            'VQSRTrancheSNP99.60to99.80',
            'VQSRTrancheSNP99.80to99.90',
            'VQSRTrancheSNP99.90to99.95',
            'VQSRTrancheSNP99.95to100.00+',
            'VQSRTrancheSNP99.95to100.00',
        ])

    infos = [
        variants_pb2.VcfInfo(
            id='END',
            number='1',
            type='Integer',
            description='Stop position of the interval')
    ]

    # (id, number, type, description) for each FORMAT header entry.
    format_specs = [
        ('GT', '1', 'String', 'Genotype'),
        ('GQ', '1', 'Integer', 'Genotype Quality'),
        ('DP', '1', 'Integer', 'Read depth of all passing filters reads.'),
        ('MIN_DP', '1', 'Integer',
         'Minimum DP observed within the GVCF block.'),
        ('AD', 'R', 'Integer',
         'Read depth of all passing filters reads for each allele.'),
        ('VAF', 'A', 'Float', 'Variant allele fractions.'),
        ('GL', 'G', 'Float', 'Genotype likelihoods, log10 encoded'),
        ('PL', 'G', 'Integer', 'Genotype likelihoods, Phred encoded'),
    ]
    formats = [
        variants_pb2.VcfFormatInfo(
            id=format_id,
            number=format_number,
            type=format_type,
            description=format_description)
        for format_id, format_number, format_type, format_description in
        format_specs
    ]

    vcf_header = variants_pb2.VcfHeader(
        contigs=contigs,
        sample_names=['NA12878_18_99'],
        filters=filters,
        infos=infos,
        formats=formats,
    )

    # --- Write only the first five canned variants. ---
    canned_variants = list(
        io_utils.read_tfrecords(golden_tfrecord, proto=variants_pb2.Variant))
    output_path = test_utils.test_tmpfile('output.vcf')
    with vcf_writer.VcfWriter.to_file(output_path, vcf_header,
                                      options) as writer:
      for variant in canned_variants[:5]:
        writer.write(variant)

    # Check: are the variants written as expected?
    # pylint: disable=line-too-long
    expected_lines = [
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=PASS,Description="All filters passed">\n',
        '##FILTER=<ID=LowQual,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
        'the interval">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
        'passing filters reads.">\n',
        '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP '
        'observed within the GVCF block.">\n',
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
        'passing filters reads for each allele.">\n',
        '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
        'fractions.">\n',
        '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype '
        'likelihoods, log10 encoded">\n',
        '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
        'likelihoods, Phred encoded">\n',
        '##contig=<ID=chr1,length=248956422>\n',
        '##contig=<ID=chr2,length=242193529>\n',
        '##contig=<ID=chr3,length=198295559>\n',
        '##contig=<ID=chrX,length=156040895>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
        'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
        'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
        'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
        'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
        'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
    ]
    # pylint: enable=line-too-long

    with tf.gfile.GFile(output_path, 'r') as written:
      actual_lines = written.readlines()
    self.assertEqual(actual_lines, expected_lines)
 def setUp(self):
   """Sets the test BAM path plus default and filename-indexed options."""
   self.bam = test_utils.genomics_core_testdata('test.bam')
   self.options = reads_pb2.SamReaderOptions()
   index_by_filename = index_pb2.INDEX_BASED_ON_FILENAME
   self.indexed_options = reads_pb2.SamReaderOptions(
       index_mode=index_by_filename)
Example #46
0
 def test_bam_iterate(self):
   """Iterating test.bam with use_index=False yields 106 records."""
   bam_path = test_utils.genomics_core_testdata('test.bam')
   reader = sam.SamReader(bam_path, use_index=False)
   with reader:
     n_records = test_utils.iterable_len(reader.iterate())
     self.assertEqual(n_records, 106)
 def test_headless_sam_raises(self):
   """Fetching the first record of headerless.sam raises ValueError."""
   sam_path = test_utils.genomics_core_testdata('headerless.sam')
   # Keep the reader bound to a local so it stays alive while iterating.
   headless_reader = sam_reader.SamReader.from_file(sam_path, self.options)
   records = iter(headless_reader.iterate())
   with self.assertRaises(ValueError):
     next(records)
Example #48
0
 def test_make_ref_reader_default(self, fasta_filename):
   """A RefFastaReader with default arguments answers a chrM query.

   Args:
     fasta_filename: Name of the FASTA file resolved through
       test_utils.genomics_core_testdata.
   """
   path = test_utils.genomics_core_testdata(fasta_filename)
   with fasta.RefFastaReader(path) as ref_reader:
     bases = ref_reader.query(ranges.make_range('chrM', 1, 6))
     self.assertEqual(bases, 'ATCAC')
Example #49
0
 def test_make_ref_reader_cache_specified(self, fasta_filename):
   """A RefFastaReader built with cache_size=10 answers a chrM query.

   Args:
     fasta_filename: Name of the FASTA file resolved through
       test_utils.genomics_core_testdata.
   """
   path = test_utils.genomics_core_testdata(fasta_filename)
   with fasta.RefFastaReader(path, cache_size=10) as ref_reader:
     bases = ref_reader.query(ranges.make_range('chrM', 1, 5))
     self.assertEqual(bases, 'ATCA')
 def test_from_file_raises_with_missing_index(self):
   """Opening unindexed.bam with the indexed options raises ValueError."""
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported name for the same check.
   with self.assertRaisesRegex(ValueError, 'Not found: No index found for'):
     sam_reader.SamReader.from_file(
         test_utils.genomics_core_testdata('unindexed.bam'),
         self.indexed_options)
 def test_from_file_raises_with_missing_index(self):
   """Opening test_sites.vcf with the indexed options raises ValueError."""
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported name for the same check.
   with self.assertRaisesRegex(ValueError, 'Not found: No index found for'):
     vcf_reader.VcfReader.from_file(
         test_utils.genomics_core_testdata('test_sites.vcf'),
         self.indexed_options)
Example #52
0
 def setUp(self):
   """Opens the sites VCF without an index and keeps its field access cache."""
   sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
   self.vcf_reader = vcf.VcfReader(sites_path, use_index=False)
   self.cache = self.vcf_reader.field_access_cache