Esempio n. 1
0
  def test_read_tfrecords_max_records(self, filename, max_records):
    """Checks that read_tfrecords yields the expected number of records.

    Args:
      filename: Name handed to write_test_protos to create the input records.
      max_records: Limit forwarded to read_tfrecords. When None, the test
        expects every written record to be returned.
    """
    protos, path = self.write_test_protos(filename)
    n_written = len(protos)

    # Without a limit all written records should come back; with a limit we
    # expect at most max_records of them.
    expected_n = (
        n_written if max_records is None else min(max_records, n_written))

    records = list(
        tfrecord.read_tfrecords(
            path, reference_pb2.ContigInfo, max_records=max_records))
    self.assertLen(records, expected_n)
Esempio n. 2
0
    def test_make_read_writer_tfrecords(self):
        """Writes two reads with SamWriter and reads them back as TFRecords."""
        out_path = test_utils.test_tmpfile('test.tfrecord')
        sam_writer = sam.SamWriter(out_path, header=self.header)
        expected_reads = [self.read1, self.read2]

        # The writer must work as a context manager and accept reads while
        # the context is open.
        with sam_writer:
            for read in expected_reads:
                sam_writer.write(read)

        # The output should contain exactly the two reads written above, in
        # the order they were written.
        written_reads = list(
            tfrecord.read_tfrecords(out_path, proto=reads_pb2.Read))
        self.assertEqual(expected_reads, written_reads)
Esempio n. 3
0
    def test_read_write_tfrecords(self, filename):
        """Round-trips ContigInfo protos through read_tfrecords.

        Args:
          filename: Name handed to write_test_protos. A name containing '@' is
            treated by this test as a sharded output, for which only the
            collection of records (not their order) is compared.
        """
        protos, path = self.write_test_protos(filename)

        # Create our generator of records from read_tfrecords.
        reader = tfrecord.read_tfrecords(path, reference_pb2.ContigInfo)

        # Make sure it's actually a generator. assertIsInstance is the
        # idiomatic type check and names the offending object on failure.
        self.assertIsInstance(reader, types.GeneratorType)

        # Check the round-trip contents.
        if '@' in filename:
            # Sharded outputs are striped across shards, so order isn't
            # preserved.
            self.assertCountEqual(protos, reader)
        else:
            self.assertEqual(protos, list(reader))
Esempio n. 4
0
    def test_writing_canned_records(self):
        """Writes every canned GFF record and checks the resulting text."""
        # The canned input is stored in TFRecord format.
        canned_path = test_utils.genomics_core_testdata(
            'test_features.gff.tfrecord')
        options = gff_pb2.GffWriterOptions()
        canned_records = list(
            tfrecord.read_tfrecords(canned_path, proto=gff_pb2.GffRecord))

        gff_path = test_utils.test_tmpfile('output.gff')
        with gff_writer.GffWriter.to_file(gff_path, self.header,
                                          options) as gff_out:
            for gff_record in canned_records:
                gff_out.write(gff_record)

        # Compare the written file line by line with the expected content.
        with open(gff_path) as gff_file:
            written_lines = gff_file.readlines()
        self.assertEqual(written_lines, self.expected_gff_content)
Esempio n. 5
0
    def test_writing_canned_records(self):
        """Writes every canned FASTQ record and checks the resulting text."""
        # The canned input is stored in TFRecord format.
        canned_path = test_utils.genomics_core_testdata(
            'test_reads.fastq.tfrecord')

        options = fastq_pb2.FastqWriterOptions()
        canned_records = list(
            tfrecord.read_tfrecords(canned_path,
                                    proto=fastq_pb2.FastqRecord))

        fastq_path = test_utils.test_tmpfile('output.fastq')
        with fastq_writer.FastqWriter.to_file(fastq_path,
                                              options) as fastq_out:
            for fastq_record in canned_records:
                fastq_out.write(fastq_record)

        # Compare the written file line by line with the expected content.
        with gfile.GFile(fastq_path, 'r') as fastq_file:
            written_lines = fastq_file.readlines()
        self.assertEqual(written_lines, self.expected_fastq_content)
Esempio n. 6
0
    def test_writing_canned_records(self):
        """Writes every canned BED record and checks the resulting text."""
        # The canned input is stored in TFRecord format.
        canned_path = test_utils.genomics_core_testdata(
            'test_regions.bed.tfrecord')

        bed_header = bed_pb2.BedHeader(num_fields=12)
        options = bed_pb2.BedWriterOptions()
        canned_records = list(
            tfrecord.read_tfrecords(canned_path, proto=bed_pb2.BedRecord))

        bed_path = test_utils.test_tmpfile('output.bed')
        with bed_writer.BedWriter.to_file(bed_path, bed_header,
                                          options) as bed_out:
            for bed_record in canned_records:
                bed_out.write(bed_record)

        # Compare the written file line by line with the expected content.
        with gfile.Open(bed_path, 'r') as bed_file:
            written_lines = bed_file.readlines()
        self.assertEqual(written_lines, self.expected_bed_content)
Esempio n. 7
0
    def test_writing_canned_variants(self):
        """Writes the first five canned variants as VCF and checks the text.

        Variant protos are loaded from the golden TFRecord test file, the
        first five are written through VcfWriter using a hand-built header,
        and the resulting file is compared line by line with the expected
        VCF content.
        """
        # The golden input is stored in TFRecord format.
        golden_path = test_utils.genomics_core_testdata(
            'test_samples.vcf.golden.tfrecord')

        options = variants_pb2.VcfWriterOptions()

        contigs = [
            reference_pb2.ContigInfo(name=contig_name, n_bases=contig_len)
            for contig_name, contig_len in (
                ('chr1', 248956422),
                ('chr2', 242193529),
                ('chr3', 198295559),
                ('chrX', 156040895),
            )
        ]

        # Only the first two filters are given an explicit description; the
        # VQSR tranche filters are declared by id alone.
        filters = [
            variants_pb2.VcfFilterInfo(id='PASS',
                                       description='All filters passed'),
            variants_pb2.VcfFilterInfo(id='LowQual', description=''),
        ]
        filters.extend(
            variants_pb2.VcfFilterInfo(id=tranche_id) for tranche_id in (
                'VQSRTrancheINDEL95.00to96.00',
                'VQSRTrancheINDEL96.00to97.00',
                'VQSRTrancheINDEL97.00to99.00',
                'VQSRTrancheINDEL99.00to99.50',
                'VQSRTrancheINDEL99.50to99.90',
                'VQSRTrancheINDEL99.90to99.95',
                'VQSRTrancheINDEL99.95to100.00+',
                'VQSRTrancheINDEL99.95to100.00',
                'VQSRTrancheSNP99.50to99.60',
                'VQSRTrancheSNP99.60to99.80',
                'VQSRTrancheSNP99.80to99.90',
                'VQSRTrancheSNP99.90to99.95',
                'VQSRTrancheSNP99.95to100.00+',
                'VQSRTrancheSNP99.95to100.00',
            ))

        infos = [
            variants_pb2.VcfInfo(id='END',
                                 number='1',
                                 type='Integer',
                                 description='Stop position of the interval')
        ]

        # Each row is (id, number, type, description) of one FORMAT field.
        formats = [
            variants_pb2.VcfFormatInfo(id=format_id,
                                       number=number,
                                       type=value_type,
                                       description=description)
            for format_id, number, value_type, description in (
                ('GT', '1', 'String', 'Genotype'),
                ('GQ', '1', 'Integer', 'Genotype Quality'),
                ('DP', '1', 'Integer',
                 'Read depth of all passing filters reads.'),
                ('MIN_DP', '1', 'Integer',
                 'Minimum DP observed within the GVCF block.'),
                ('AD', 'R', 'Integer',
                 'Read depth of all passing filters reads for each allele.'),
                ('VAF', 'A', 'Float', 'Variant allele fractions.'),
                ('PL', 'G', 'Integer', 'Genotype likelihoods, Phred encoded'),
            )
        ]

        header = variants_pb2.VcfHeader(
            contigs=contigs,
            sample_names=['NA12878_18_99'],
            filters=filters,
            infos=infos,
            formats=formats,
        )

        variants = list(
            tfrecord.read_tfrecords(golden_path, proto=variants_pb2.Variant))
        vcf_path = test_utils.test_tmpfile('output.vcf')
        with vcf_writer.VcfWriter.to_file(vcf_path, header,
                                          options) as vcf_out:
            # Only the first five variants are written and checked.
            for variant in variants[:5]:
                vcf_out.write(variant)

        # Check: are the variants written as expected?
        # pylint: disable=line-too-long
        expected_header_lines = [
            '##fileformat=VCFv4.2\n',
            '##FILTER=<ID=PASS,Description="All filters passed">\n',
            '##FILTER=<ID=LowQual,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
            '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
            'the interval">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
            '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
            'passing filters reads.">\n',
            '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP '
            'observed within the GVCF block.">\n',
            '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
            'passing filters reads for each allele.">\n',
            '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
            'fractions.">\n',
            '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
            'likelihoods, Phred encoded">\n',
            '##contig=<ID=chr1,length=248956422>\n',
            '##contig=<ID=chr2,length=242193529>\n',
            '##contig=<ID=chr3,length=198295559>\n',
            '##contig=<ID=chrX,length=156040895>\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
        ]
        expected_variant_lines = [
            'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
            'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
            'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
            'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
            'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
        ]
        # pylint: enable=line-too-long

        with gfile.GFile(vcf_path, 'r') as vcf_file:
            written_lines = vcf_file.readlines()
        self.assertEqual(written_lines,
                         expected_header_lines + expected_variant_lines)