def test_vcf_to_csv(): import vcf, csv vcfreader = vcf.Reader(open('tests/data/snv.vcf')) columns = ['sample:DP', 'info:VAF', 'info:DP', 'something'] filename = vcf_to_csv(vcfreader, columns, None, default_values={'something': True}) with open(filename) as fd: rows = list(csv.reader(fd)) asserts.eq_(rows[0], ['44', '', '81', 'True']) asserts.eq_(rows[-1], ['40', '', '74', 'True']) asserts.eq_(len(rows), 20)
def test_vcf_to_csv(): vcfreader = vcf.Reader(open('tests/data/snv.vcf')) columns = ['contig', 'position', 'sample:DP', 'info:VAF', 'info:DP', 'something'] filename = vcf_to_csv(vcfreader, columns, None, default_values={'something': 'Something!'}) with open(filename) as f: rows = list(csv.reader(f)) # first two are from the same sample assert rows[0] == ['20', '61795', '44', '', '81', 'Something!'] assert rows[1] == ['20', '61795', '37', '', '81', 'Something!'] # this is the last sample assert rows[-1] == ['20', '75254', '40', '', '74', 'Something!'] assert len(rows) == 20 # 10 records, 2 samples each = 20 rows
def test_csv_to_vcf(): # Here we just write the VCF to a CSV like we did above, and then read it # and compare it to the original VCF to make sure we haven't lost any data # in the conversation. vcfreader = vcf.Reader(open('tests/data/one-sample.vcf')) columns = ['contig', 'position', 'id', 'reference', 'alternates', 'quality', 'filters', 'info:DP', 'info:SS', 'info:SSC', 'info:GPV', 'info:SPV', 'info:SOMATIC', 'sample:GT','sample:GQ','sample:DP','sample:RD', 'sample:AD','sample:FREQ','sample:DP4', 'sample_name'] filename = vcf_to_csv(vcfreader, columns, None) # Now we convert the rows in this CSV into dict (converting the position as # well), which is what it would look like select from Postgres. with open(filename) as f: rows = list(csv.reader(f)) relations = [] for row in rows: relation = { col: val if val != '' else None for col, val in zip(columns, row) } relation['position'] = int(relation['position']) relations.append(relation) text = open('tests/data/one-sample.vcf').readlines() header = (line for line in text if line.startswith('#')) template = vcf.Reader(header) # Here we convert those dicts to vcf.Records, for later writing. records = genotypes_to_records(relations, template, columns) # Now we write the output of those to a string buffer... vcf_sb = StringIO.StringIO() writer = vcf.Writer(vcf_sb, template) for record in records: writer.write_record(record) # And see if we get the same VCF. # # We can't just e.g. test that the text of # both files is equal, as pyVCF will e.g. emit header lines in different # orders, or e.g. include fields in the INFO field which may be blank and # thus omitted in the original VCF. Scientific notation, for example, also # can change, e.g. 1.05 can become 0.105e-1. # # So, instead, we test that the same number of records, exists, that they # all have the correct CHROM, POS, REF, ALT and select sample and info # fields. original_vcf = list(vcf.Reader(open('tests/data/one-sample.vcf'))) vcf_sb.seek(0) saved_vcf = list(vcf.Reader(vcf_sb)) for o, s in zip(original_vcf, saved_vcf): assert o.CHROM == s.CHROM assert o.POS == s.POS assert o.REF == s.REF assert o.ALT == s.ALT assert o.samples[0].data.DP == s.samples[0].data.DP assert o.samples[0].data.GT == s.samples[0].data.GT assert o.samples[0].data.DP4 == s.samples[0].data.DP4 assert o.INFO['DP'] == s.INFO['DP'] assert o.INFO['SS'] == s.INFO['SS']