def test_read_tfrecords_max_records(self, filename, max_records):
  """Checks that read_tfrecords honors the `max_records` cap.

  Args:
    filename: Name handed to self.write_test_protos to create the input.
    max_records: Value forwarded to read_tfrecords as `max_records`. When it
      is None the test expects every written proto to come back.
  """
  protos, path = self.write_test_protos(filename)

  # Without a cap every proto is expected; with one, the cap applies unless
  # fewer protos were written than the cap allows.
  n_written = len(protos)
  expected_n = (
      n_written if max_records is None else min(max_records, n_written))

  # Materialize the records from read_tfrecords so they can be counted.
  records = list(
      tfrecord.read_tfrecords(
          path, reference_pb2.ContigInfo, max_records=max_records))
  self.assertLen(records, expected_n)
def test_make_read_writer_tfrecords(self):
  """Writes two reads with SamWriter and reads them back as TFRecords."""
  outfile = test_utils.test_tmpfile('test.tfrecord')
  writer = sam.SamWriter(outfile, header=self.header)

  # The writer must work as a context manager and accept reads while open.
  expected_reads = [self.read1, self.read2]
  with writer:
    for read in expected_reads:
      writer.write(read)

  # Both reads should come back from the output, in the order written.
  actual_reads = list(tfrecord.read_tfrecords(outfile, proto=reads_pb2.Read))
  self.assertEqual(expected_reads, actual_reads)
def test_read_write_tfrecords(self, filename):
  """Round-trips protos through a TFRecord file and checks the contents.

  Args:
    filename: Name handed to self.write_test_protos. A name containing '@'
      is treated as a sharded output, for which only the multiset of
      records (not their order) is compared.
  """
  protos, path = self.write_test_protos(filename)

  # Create our generator of records from read_tfrecords.
  reader = tfrecord.read_tfrecords(path, reference_pb2.ContigInfo)

  # Make sure it's actually a generator. assertIsInstance is the idiomatic
  # type check and produces a clearer failure message than comparing type().
  self.assertIsInstance(reader, types.GeneratorType)

  # Check the round-trip contents.
  if '@' in filename:
    # Sharded outputs are striped across shards, so order isn't preserved.
    self.assertCountEqual(protos, reader)
  else:
    self.assertEqual(protos, list(reader))
def test_writing_canned_records(self):
  """Writes the canned GFF records and checks the resulting file.

  Reads every GffRecord from the canned TFRecord file, writes them to a
  temporary GFF file with GffWriter, and compares the written lines with
  self.expected_gff_content.
  """
  # The canned input is stored in TFRecord format.
  canned_path = test_utils.genomics_core_testdata(
      'test_features.gff.tfrecord')
  options = gff_pb2.GffWriterOptions()
  canned_records = list(
      tfrecord.read_tfrecords(canned_path, proto=gff_pb2.GffRecord))
  output_path = test_utils.test_tmpfile('output.gff')

  with gff_writer.GffWriter.to_file(output_path, self.header,
                                    options) as writer:
    for gff_record in canned_records:
      writer.write(gff_record)

  with open(output_path) as written:
    actual_lines = written.readlines()
  self.assertEqual(actual_lines, self.expected_gff_content)
def test_writing_canned_records(self):
  """Writes the canned FASTQ records and checks the resulting file.

  Reads every FastqRecord from the canned TFRecord file, writes them to a
  temporary FASTQ file with FastqWriter, and compares the written lines with
  self.expected_fastq_content.
  """
  # The canned input is stored in TFRecord format.
  canned_path = test_utils.genomics_core_testdata(
      'test_reads.fastq.tfrecord')
  options = fastq_pb2.FastqWriterOptions()
  canned_records = list(
      tfrecord.read_tfrecords(canned_path, proto=fastq_pb2.FastqRecord))
  output_path = test_utils.test_tmpfile('output.fastq')

  with fastq_writer.FastqWriter.to_file(output_path, options) as writer:
    for fastq_record in canned_records:
      writer.write(fastq_record)

  with gfile.GFile(output_path, 'r') as written:
    actual_lines = written.readlines()
  self.assertEqual(actual_lines, self.expected_fastq_content)
def test_writing_canned_records(self):
  """Writes the canned BED records and checks the resulting file.

  Reads every BedRecord from the canned TFRecord file, writes them to a
  temporary BED file with BedWriter under a 12-field header, and compares
  the written lines with self.expected_bed_content.
  """
  # The canned input is stored in TFRecord format.
  canned_path = test_utils.genomics_core_testdata(
      'test_regions.bed.tfrecord')
  bed_header = bed_pb2.BedHeader(num_fields=12)
  options = bed_pb2.BedWriterOptions()
  canned_records = list(
      tfrecord.read_tfrecords(canned_path, proto=bed_pb2.BedRecord))
  output_path = test_utils.test_tmpfile('output.bed')

  with bed_writer.BedWriter.to_file(output_path, bed_header,
                                    options) as writer:
    for bed_record in canned_records:
      writer.write(bed_record)

  with gfile.Open(output_path, 'r') as written:
    actual_lines = written.readlines()
  self.assertEqual(actual_lines, self.expected_bed_content)
def test_writing_canned_variants(self):
  """Writes the first five canned variants to VCF and checks the output.

  Builds a VcfHeader by hand, reads the Variant protos from the canned
  TFRecord file, writes the first five of them with VcfWriter, and compares
  the written file line by line against the expected VCF text.
  """
  # The canned input is stored in TFRecord format.
  golden_tfrecord = test_utils.genomics_core_testdata(
      'test_samples.vcf.golden.tfrecord')
  writer_options = variants_pb2.VcfWriterOptions()

  # (name, n_bases) for each contig declared in the header.
  contig_sizes = [
      ('chr1', 248956422),
      ('chr2', 242193529),
      ('chr3', 198295559),
      ('chrX', 156040895),
  ]
  # Filters that are declared with an id only (no description argument).
  tranche_filter_ids = [
      'VQSRTrancheINDEL95.00to96.00',
      'VQSRTrancheINDEL96.00to97.00',
      'VQSRTrancheINDEL97.00to99.00',
      'VQSRTrancheINDEL99.00to99.50',
      'VQSRTrancheINDEL99.50to99.90',
      'VQSRTrancheINDEL99.90to99.95',
      'VQSRTrancheINDEL99.95to100.00+',
      'VQSRTrancheINDEL99.95to100.00',
      'VQSRTrancheSNP99.50to99.60',
      'VQSRTrancheSNP99.60to99.80',
      'VQSRTrancheSNP99.80to99.90',
      'VQSRTrancheSNP99.90to99.95',
      'VQSRTrancheSNP99.95to100.00+',
      'VQSRTrancheSNP99.95to100.00',
  ]
  # (id, number, type, description) for each FORMAT field in the header.
  format_fields = [
      ('GT', '1', 'String', 'Genotype'),
      ('GQ', '1', 'Integer', 'Genotype Quality'),
      ('DP', '1', 'Integer', 'Read depth of all passing filters reads.'),
      ('MIN_DP', '1', 'Integer',
       'Minimum DP observed within the GVCF block.'),
      ('AD', 'R', 'Integer',
       'Read depth of all passing filters reads for each allele.'),
      ('VAF', 'A', 'Float', 'Variant allele fractions.'),
      ('PL', 'G', 'Integer', 'Genotype likelihoods, Phred encoded'),
  ]

  described_filters = [
      variants_pb2.VcfFilterInfo(id='PASS', description='All filters passed'),
      variants_pb2.VcfFilterInfo(id='LowQual', description=''),
  ]
  tranche_filters = [
      variants_pb2.VcfFilterInfo(id=filter_id)
      for filter_id in tranche_filter_ids
  ]
  header = variants_pb2.VcfHeader(
      contigs=[
          reference_pb2.ContigInfo(name=contig_name, n_bases=n_bases)
          for contig_name, n_bases in contig_sizes
      ],
      sample_names=['NA12878_18_99'],
      filters=described_filters + tranche_filters,
      infos=[
          variants_pb2.VcfInfo(
              id='END',
              number='1',
              type='Integer',
              description='Stop position of the interval')
      ],
      formats=[
          variants_pb2.VcfFormatInfo(
              id=field_id,
              number=number,
              type=field_type,
              description=description)
          for field_id, number, field_type, description in format_fields
      ],
  )

  canned_variants = list(
      tfrecord.read_tfrecords(golden_tfrecord, proto=variants_pb2.Variant))
  output_path = test_utils.test_tmpfile('output.vcf')

  # Only the first five canned variants are written and checked.
  with vcf_writer.VcfWriter.to_file(output_path, header,
                                    writer_options) as writer:
    for variant in canned_variants[:5]:
      writer.write(variant)

  # Check: are the variants written as expected?
  # pylint: disable=line-too-long
  expected_vcf_content = [
      '##fileformat=VCFv4.2\n',
      '##FILTER=<ID=PASS,Description="All filters passed">\n',
      '##FILTER=<ID=LowQual,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
      '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
      'the interval">\n',
      '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
      '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
      '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
      'passing filters reads.">\n',
      '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP '
      'observed within the GVCF block.">\n',
      '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
      'passing filters reads for each allele.">\n',
      '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
      'fractions.">\n',
      '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
      'likelihoods, Phred encoded">\n',
      '##contig=<ID=chr1,length=248956422>\n',
      '##contig=<ID=chr2,length=242193529>\n',
      '##contig=<ID=chr3,length=198295559>\n',
      '##contig=<ID=chrX,length=156040895>\n',
      '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
      'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
      'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
      'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
      'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
      'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
  ]
  # pylint: enable=line-too-long

  with gfile.GFile(output_path, 'r') as written:
    actual_lines = written.readlines()
  self.assertEqual(actual_lines, expected_vcf_content)