def test_round_trip_vcf(self, test_datum_name):
    """Check that variants survive a read -> write -> read cycle unchanged.

    The variants of the named test datum are read with a VCF reader, written
    to a temporary file with VcfWriter (reusing the contigs, samples and
    filters reported by the first reader), read back, and compared with the
    variants that were read originally.

    Args:
      test_datum_name: Name of the test data file to round-trip; it is also
        used (prefixed with 'output_') as the temporary output file name.
    """
    source_path = test_utils.genomics_core_testdata(test_datum_name)
    round_trip_path = test_utils.test_tmpfile('output_' + test_datum_name)

    # Step 1: load every variant from the input file.
    source_reader = genomics_io.make_vcf_reader(source_path, use_index=False)
    source_variants = list(source_reader.iterate())
    self.assertTrue(source_variants, 'Reader failed to find records')

    # Step 2: write them back out, describing the output with the header
    # information the source reader exposes.
    options = core_pb2.VcfWriterOptions(
        contigs=source_reader.contigs,
        sample_names=source_reader.samples,
        filters=source_reader.filters)
    with vcf_writer.VcfWriter.to_file(round_trip_path, options) as writer:
        for variant in source_variants:
            writer.write(variant)

    # Step 3: read the freshly written file.
    round_trip_reader = genomics_io.make_vcf_reader(
        round_trip_path, use_index=False)
    round_trip_variants = list(round_trip_reader.iterate())

    # Step 4: both passes must yield the same variants.
    self.assertEqual(source_variants, round_trip_variants,
                     'Round-tripped variants not as expected')
def write_variant_to_tempfile(self, variant):
    """Write a single variant to a temporary VCF file and return its path.

    The writer is configured with one contig named '20', no filters, and one
    sample per call on the variant (taken from each call's call_set_name).

    Args:
      variant: The variant to write; its `calls` supply the sample names.

    Returns:
      The path of the temporary file ('test.vcf') that was written.
    """
    path = test_utils.test_tmpfile('test.vcf')
    contigs = [core_pb2.ContigInfo(name='20')]
    sample_names = [
        variant_call.call_set_name for variant_call in variant.calls
    ]
    writer = genomics_io.make_vcf_writer(
        outfile=path, contigs=contigs, samples=sample_names, filters=[])
    # The writer is used as a context manager around the single write.
    with writer:
        writer.write(variant)
    return path
def _parse_read_with_aux_tags(self, tag_string):
    """Write a one-read SAM file carrying `tag_string` and parse it back.

    Args:
      tag_string: Text appended as the final column(s) of the stock read,
        i.e. the optional AUX fields to be parsed.

    Returns:
      A list of everything the SAM reader's iterate() yields for the file,
      with the reader created using parse_aux_fields=True.
    """
    # SAM is a TAB-delimited format: header fields and alignment columns must
    # be separated by '\t', not spaces. The separators are spelled out
    # explicitly so they cannot be silently lost to whitespace normalization.
    # Minimal header to create a valid SAM file.
    header_lines = (
        '\t'.join(['@HD', 'VN:1.3', 'SO:coordinate']) + '\n' +
        '\t'.join(['@SQ', 'SN:chr1', 'LN:248956422']) + '\n')
    # A single stock read; the caller's AUX fields become the trailing
    # column(s).
    read = '\t'.join([
        'read_name', '0', 'chr1', '1', '0', '3M', '*', '0', '0', 'CCC', 'AAA',
        tag_string
    ])
    # NOTE(review): the temp file is named '.bam' although plain-text SAM is
    # written to it; presumably the reader detects the format from the file
    # content -- confirm.
    path = test_utils.test_tmpfile('aux_tags.bam')
    # tf.gfile.FastGFile is a deprecated alias; GFile is the supported name.
    with tf.gfile.GFile(path, 'w') as fout:
        fout.write(header_lines)
        fout.write(read + '\n')
    with genomics_io.make_sam_reader(
        path, use_index=False, parse_aux_fields=True) as reader:
        return list(reader.iterate())
def setUp(self):
    """Prepare the output path, writer options, writer and a sample variant.

    Sets:
      self.out_fname: temporary path 'output.vcf'.
      self.options: VcfWriterOptions with contigs Chr1/Chr2, samples
        Fido/Spot and no filters.
      self.writer: a VcfWriter opened on self.out_fname with self.options.
      self.variant: an A->C variant at Chr1:10 with one call per sample.
    """
    self.out_fname = test_utils.test_tmpfile('output.vcf')

    contigs = [
        core_pb2.ContigInfo(name='Chr1', n_bases=50, pos_in_fasta=0),
        core_pb2.ContigInfo(name='Chr2', n_bases=25, pos_in_fasta=1),
    ]
    self.options = core_pb2.VcfWriterOptions(
        contigs=contigs, sample_names=['Fido', 'Spot'], filters=[])
    self.writer = vcf_writer.VcfWriter.to_file(self.out_fname, self.options)

    self.variant = test_utils.make_variant(
        chrom='Chr1', start=10, alleles=['A', 'C'])
    # One call per sample, added in the same order as sample_names above.
    for sample_name, genotype in (('Fido', [0, 0]), ('Spot', [0, 1])):
        self.variant.calls.add(genotype=genotype, call_set_name=sample_name)
def test_make_read_writer_tfrecords(self):
    """Check that make_read_writer writes reads to a TFRecord file.

    Two reads (self.read1 and self.read2) are written through the writer,
    used as a context manager, and the file is then read back and compared.
    """
    outfile = test_utils.test_tmpfile('test.tfrecord')
    writer = genomics_io.make_read_writer(outfile=outfile)

    # The writer must work as a context manager and accept reads.
    with writer:
        for read in (self.read1, self.read2):
            writer.write(read)

    # The output should contain exactly the two reads, in write order.
    written_reads = list(
        io_utils.read_tfrecords(outfile, proto=reads_pb2.Read))
    self.assertEqual([self.read1, self.read2], written_reads)
def test_writing_canned_variants(self):
    """Writes the first five canned variants and checks the VCF text output.

    The variants come from the golden TFRecord test file. They are written
    with a VcfWriter configured with four contigs, one sample and fifteen
    filters, and the resulting file is compared line by line with the
    expected header and record lines.
    """
    # The input is stored in TFRecord format.
    tfrecord_file = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')

    contig_lengths = [
        ('chr1', 248956422),
        ('chr2', 242193529),
        ('chr3', 198295559),
        ('chrX', 156040895),
    ]
    filter_ids = [
        'LowQual',
        'VQSRTrancheINDEL95.00to96.00',
        'VQSRTrancheINDEL96.00to97.00',
        'VQSRTrancheINDEL97.00to99.00',
        'VQSRTrancheINDEL99.00to99.50',
        'VQSRTrancheINDEL99.50to99.90',
        'VQSRTrancheINDEL99.90to99.95',
        'VQSRTrancheINDEL99.95to100.00+',
        'VQSRTrancheINDEL99.95to100.00',
        'VQSRTrancheSNP99.50to99.60',
        'VQSRTrancheSNP99.60to99.80',
        'VQSRTrancheSNP99.80to99.90',
        'VQSRTrancheSNP99.90to99.95',
        'VQSRTrancheSNP99.95to100.00+',
        'VQSRTrancheSNP99.95to100.00',
    ]
    writer_options = core_pb2.VcfWriterOptions(
        contigs=[
            core_pb2.ContigInfo(name=contig_name, n_bases=length)
            for contig_name, length in contig_lengths
        ],
        sample_names=['NA12878_18_99'],
        filters=[
            core_pb2.VcfFilterInfo(id=filter_id) for filter_id in filter_ids
        ])

    variant_records = list(
        io_utils.read_tfrecords(tfrecord_file, proto=variants_pb2.Variant))
    out_fname = test_utils.test_tmpfile('output.vcf')

    # Only the first five variants are written.
    with vcf_writer.VcfWriter.to_file(out_fname, writer_options) as writer:
        for variant in variant_records[:5]:
            writer.write(variant)

    # The expected text is kept fully literal (not derived from the tables
    # above) so the comparison stays an independent golden check.
    # pylint: disable=line-too-long
    expected_header_lines = [
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=PASS,Description="All filters passed">\n',
        '##FILTER=<ID=LowQual,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
        'passing filters reads.">\n',
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
        'passing filters reads for each allele.">\n',
        '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
        'fractions.">\n',
        '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype '
        'likelihoods, log10 encoded">\n',
        '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
        'likelihoods, Phred encoded">\n',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
        'the interval">\n',
        '##contig=<ID=chr1,length=248956422>\n',
        '##contig=<ID=chr2,length=242193529>\n',
        '##contig=<ID=chr3,length=198295559>\n',
        '##contig=<ID=chrX,length=156040895>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
    ]
    expected_record_lines = [
        'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
        'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
        'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
        'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
        'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n',
    ]
    # pylint: enable=line-too-long
    expected_vcf_content = expected_header_lines + expected_record_lines

    with tf.gfile.GFile(out_fname, 'r') as written_vcf:
        written_lines = written_vcf.readlines()
    self.assertEqual(written_lines, expected_vcf_content)
def write_test_protos(self, filename):
    """Write ten ContigInfo protos to a temporary file.

    The protos are named '0' through '9' and are written with
    io.write_tfrecords.

    Args:
      filename: Name of the temporary file to create.

    Returns:
      A (protos, path) tuple: the list of protos written and the path of the
      temporary file they were written to.
    """
    contig_names = (str(index) for index in range(10))
    protos = [
        core_pb2.ContigInfo(name=contig_name) for contig_name in contig_names
    ]
    destination = test_utils.test_tmpfile(filename)
    io.write_tfrecords(protos, destination)
    return protos, destination