def test_var_subtype(self):
    """Check ``var_subtype`` for selected positions of both example files."""
    # Expected subtype keyed by record position (small variants).
    small_variant_subtypes = {
        14370: "ts",
        17330: "tv",
        1110696: "unknown",
        1230237: "del",
        1234567: "unknown",
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        # The property is read for every record, not only the listed ones.
        subtype = var.var_subtype
        if var.POS in small_variant_subtypes:
            self.assertEqual(small_variant_subtypes[var.POS], subtype)
    reader.close()

    # SV tests
    sv_subtypes = {
        2827693: "DEL",
        321682: "DEL",
        14477084: "DEL:ME:ALU",
        9425916: "INS:ME:L1",
        12665100: "DUP",
        18665128: "DUP:TANDEM",
    }
    reader = cyvcf.Reader(fh('example-4.1-sv.vcf'))
    for var in reader:
        subtype = var.var_subtype
        if var.POS in sv_subtypes:
            self.assertEqual(sv_subtypes[var.POS], subtype)
    reader.close()
def test_walk(self):
    """Exercise ``utils.walk_together`` on identical and on differing inputs."""
    # easy case: all same sites
    same_readers = [cyvcf.Reader(fh('example-4.0.vcf')) for _ in range(3)]
    walked = 0
    for group in utils.walk_together(*same_readers):
        assert len(group) == 3
        assert group[0] == group[1] == group[2]
        walked += 1
    assert walked == 5

    # artificial case 2 from the left, 2 from the right, 2 together,
    # 1 from the right, 1 from the left
    expected = 'llrrttrl'
    left_reader = cyvcf.Reader(fh('walk_left.vcf'))
    right_reader = cyvcf.Reader(fh('example-4.0.vcf'))
    steps = utils.walk_together(left_reader, right_reader)
    for side, recs in zip(expected, steps):
        left, right = recs[0], recs[1]
        if side == 'l':
            assert left is not None
            assert right is None
        elif side == 'r':
            assert right is not None
            assert left is None
        elif side == 't':
            assert left is not None
            assert right is not None
def test_sv_end(self):
    """Check ``sv_end`` for selected positions of both example files."""
    # Expected end coordinate keyed by record position (SV file).
    sv_ends = {
        2827693: 2827680,
        321682: 321887,
        14477084: 14477381,
        9425916: 9425916,
        12665100: 12686200,
        18665128: 18665204,
    }
    reader = cyvcf.Reader(fh('example-4.1-sv.vcf'))
    for var in reader:
        sv_end = var.sv_end
        if var.POS in sv_ends:
            self.assertEqual(sv_ends[var.POS], sv_end)
    reader.close()

    # Every listed record of the 4.0 example is expected to report None.
    positions_without_end = (14370, 17330, 1110696, 1230237, 1234567)
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        sv_end = var.sv_end
        if var.POS in positions_without_end:
            self.assertEqual(None, sv_end)
    reader.close()
def test_var_type(self):
    """Check ``var_type`` for selected positions of both example files."""
    small_variant_types = {
        14370: "snp",
        17330: "snp",
        1110696: "snp",
        1230237: "indel",
        1234567: "indel",
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        observed = var.var_type
        if var.POS in small_variant_types:
            self.assertEqual(small_variant_types[var.POS], observed)
    reader.close()

    # SV tests: every listed record is expected to be typed "sv".
    sv_positions = (2827693, 321682, 14477084, 9425916, 12665100, 18665128)
    reader = cyvcf.Reader(fh('example-4.1-sv.vcf'))
    for var in reader:
        observed = var.var_type
        if var.POS in sv_positions:
            self.assertEqual("sv", observed)
    reader.close()
def test_is_sv_precise(self):
    """Check ``is_sv_precise`` for selected positions of both example files."""
    sv_precision = {
        2827693: True,
        321682: False,
        14477084: False,
        9425916: False,
        12665100: False,
        18665128: False,
    }
    reader = cyvcf.Reader(fh('example-4.1-sv.vcf'))
    for var in reader:
        is_precise = var.is_sv_precise
        if var.POS in sv_precision:
            self.assertEqual(sv_precision[var.POS], is_precise)
    reader.close()

    # Listed records of the 4.0 example are all expected to be False.
    small_variant_positions = (14370, 17330, 1110696, 1230237, 1234567)
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        is_precise = var.is_sv_precise
        if var.POS in small_variant_positions:
            self.assertEqual(False, is_precise)
    reader.close()
def test_null_mono(self):
    """Round-trip ``null_genotype_mono.vcf`` through Writer and Reader.

    The file is read, every record is written to an in-memory buffer, and
    the buffer is parsed again; the first re-parsed record must still have
    samples.
    """
    # null qualities were written as blank, causing subsequent parse to fail
    print(os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'null_genotype_mono.vcf')))
    p = cyvcf.Reader(fh('null_genotype_mono.vcf'))
    assert p.samples
    out = StringIO()
    writer = cyvcf.Writer(out, p)
    # Explicit loop: map() is lazy on Python 3 and would write nothing.
    for record in p:
        writer.write_record(record)
    out.seek(0)
    print(out.getvalue())
    p2 = cyvcf.Reader(out)
    # next() works on both Python 2 and 3, unlike the .next() method.
    rec = next(p2)
    assert rec.samples
def test_num_calls(self):
    """Per-category call counts must add up to the number of samples."""
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        counts = (
            var.num_hom_ref,
            var.num_hom_alt,
            var.num_het,
            var.num_unknown,
        )
        self.assertEqual(len(var.samples), sum(counts))
    reader.close()
def testApplyFilter(self):
    """Run ``vcf_filter.py`` with a site-quality filter and inspect its output."""
    command = 'python scripts/vcf_filter.py --site-quality 30 test/example-4.0.vcf sq'
    status, output = subprocess.getstatusoutput(command)
    assert status == 0

    # Feed the captured output back into a Reader via an in-memory buffer.
    buf = StringIO()
    buf.write(output)
    buf.seek(0)
    print(buf.getvalue())
    reader = cyvcf.Reader(buf)

    # check filter got into output file
    assert 'sq30' in reader.filters
    print(reader.filters)

    # check sites were filtered
    filtered_sites = 0
    for record in reader:
        if record.QUAL < 30:
            assert 'sq30' in record.FILTER
            filtered_sites += 1
        else:
            assert record.FILTER is None or 'sq30' not in record.FILTER
    assert filtered_sites == 2
def testWrite(self):
    """Write ``gatk.vcf`` to a buffer and check it parses back unchanged.

    Compares samples, formats and infos of the original and re-parsed
    readers, then the samples of each record pair.
    """
    reader = cyvcf.Reader(fh('gatk.vcf'))
    out = StringIO()
    writer = cyvcf.Writer(out, reader)

    records = list(reader)
    # Explicit loop: map() is lazy on Python 3, so the records would never
    # be written and the comparison loop below would be vacuous.
    for record in records:
        writer.write_record(record)
    out.seek(0)
    reader2 = cyvcf.Reader(out)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(reader.samples, reader2.samples)
    self.assertEqual(reader.formats, reader2.formats)
    self.assertEqual(reader.infos, reader2.infos)

    for l, r in zip(records, reader2):
        self.assertEqual(l.samples, r.samples)
def testParse(self):
    """Parse ``freebayes.vcf``: 7 samples, truthy calls, ``self.n_calls`` records."""
    reader = cyvcf.Reader(fh('freebayes.vcf'))
    print(reader.samples)
    self.assertEqual(len(reader.samples), 7)

    record_count = 0
    for record in reader:
        record_count += 1
        # Every item yielded by iterating a record must be truthy.
        for call in record:
            assert call
    assert record_count == self.n_calls
def test_gt_alt_depths(self):
    """Check per-sample ``gt_alt_depth`` for selected positions of ``gatk.vcf``."""
    expected_by_pos = {
        42522392: [0, 107, 77, 0, 1, 0, 0],
        42522613: [4, 127, 0, 85, 132, 135, 126],
        42527891: [-1, 7, 3, 11, 16, 14, 11],
    }
    reader = cyvcf.Reader(fh('gatk.vcf'))
    for var in reader:
        gt_alt_depths = [sample.gt_alt_depth for sample in var.samples]
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], gt_alt_depths)
def test_gt_quals(self):
    """Check per-sample ``gt_qual`` for selected positions of ``gatk.vcf``."""
    expected_by_pos = {
        42522392: [18.04, 99, 99, 99, 99, 99, 99],
        42522613: [62.64, 99, 99, 99, 99, 99, 99],
        42527891: [-1, 13.70, 5.97, 31.42, 49.09, 52.10, 12.71],
    }
    reader = cyvcf.Reader(fh('gatk.vcf'))
    for var in reader:
        gt_quals = [sample.gt_qual for sample in var.samples]
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], gt_quals)
def test_gt_bases(self):
    """Check per-sample ``gt_bases`` for selected positions of the 4.0 example."""
    expected_by_pos = {
        14370: ['G|G', 'A|G', 'A/A'],
        17330: ['T|T', 'T|A', 'T/T'],
        1110696: ['G|T', 'T|G', 'T/T'],
        1230237: ['T|T', 'T|T', 'T/T'],
        1234567: [None, 'GTCT/GTACT', 'G/G'],
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        gt_bases = [sample.gt_bases for sample in var.samples]
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], gt_bases)
def test_call_rate(self):
    """Check ``call_rate`` for selected positions of the 4.0 example."""
    expected_by_pos = {
        14370: 3.0 / 3.0,
        17330: 3.0 / 3.0,
        1110696: 3.0 / 3.0,
        1230237: 3.0 / 3.0,
        1234567: 2.0 / 3.0,
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        call_rate = var.call_rate
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], call_rate)
def test_aaf(self):
    """Check ``aaf`` for selected positions of the 4.0 example."""
    # None is a legitimate expected value here, so membership is tested
    # explicitly instead of relying on dict.get().
    expected_by_pos = {
        14370: 3.0 / 6.0,
        17330: 1.0 / 6.0,
        1110696: None,
        1230237: 0.0 / 6.0,
        1234567: None,
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        aaf = var.aaf
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], aaf)
def test_pi(self):
    """Check ``nucl_diversity`` for selected positions of the 4.0 example."""
    # None is a legitimate expected value here, so membership is tested
    # explicitly instead of relying on dict.get().
    expected_by_pos = {
        14370: 6.0 / 10.0,
        17330: 1.0 / 3.0,
        1110696: None,
        1230237: 0.0 / 6.0,
        1234567: None,
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        pi = var.nucl_diversity
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], pi)
def test_is_deletion(self):
    """Check ``is_deletion`` for selected positions of the 4.0 example."""
    expected_by_pos = {
        14370: False,
        17330: False,
        1110696: False,
        1230237: True,
        1234567: False,
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        is_del = var.is_deletion
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], is_del)
def test_gt_ref_depths(self):
    """Check per-sample ``gt_ref_depth`` for selected positions of ``gatk.vcf``."""
    expected_by_pos = {
        42522392: [6, 138, 169, 249, 248, 250, 250],
        42522613: [13, 118, 241, 161, 110, 106, 116],
        42527891: [-1, 238, 246, 239, 232, 233, 238],
    }
    reader = cyvcf.Reader(fh('gatk.vcf'))
    for var in reader:
        gt_ref_depths = [sample.gt_ref_depth for sample in var.samples]
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], gt_ref_depths)
def use_cyvcf(vcf_file):
    """Copy a VCF to stdout via cyvcf, flagging records that fail the MSI check.

    Each record is passed through ``proc_fields``; when that returns a
    truthy value, ``MSI_FAIL`` is appended to the record's FILTER before
    the record is written.

    Not installing (py3 not supported, py2 installs but import doesn't work).
    """
    import cyvcf  # need to reinstall instead of pyvcf
    import gzip

    # Choose the opener from the file extension.
    opener = gzip.open if vcf_file.endswith('.gz') else open
    with opener(vcf_file) as handle:
        vcf_reader = cyvcf.Reader(handle)
        vcf_writer = cyvcf.Writer(sys.stdout, vcf_reader)
        for rec in vcf_reader:
            first_alt = rec.ALT[0]
            first_sample_af = rec.samples[0]['AF']
            if proc_fields(rec.REF, first_alt, first_sample_af, rec.INFO['MSI']):
                # NOTE(review): assumes rec.FILTER is a list here; confirm it
                # cannot be None for unfiltered records.
                rec.FILTER.append('MSI_FAIL')
            vcf_writer.write_record(rec)
def test_is_snp(self):
    """Check ``is_snp`` for selected positions of the 4.0 example."""
    expected_by_pos = {
        14370: True,
        17330: True,
        1110696: True,
        1230237: False,
        1234567: False,
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        is_snp = var.is_snp
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], is_snp)
    reader.close()
def test_phased(self):
    """Check ``gt_phases`` for selected positions of the 4.0 example."""
    mostly_phased = [True, True, False]
    expected_by_pos = {
        14370: mostly_phased,
        17330: mostly_phased,
        1110696: mostly_phased,
        1230237: mostly_phased,
        1234567: [False, False, False],
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        phases = var.gt_phases
        print(var)
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], phases)
def testApplyMultipleFilters(self):
    """Run ``vcf_filter.py`` with two filters and check both reach the header."""
    command = ('python scripts/vcf_filter.py --site-quality 30 '
               '--genotype-quality 50 test/example-4.0.vcf sq mgq')
    status, output = subprocess.getstatusoutput(command)
    assert status == 0

    # Re-parse the captured output from an in-memory buffer.
    buf = StringIO()
    buf.write(output)
    buf.seek(0)
    reader = cyvcf.Reader(buf)

    print(reader.filters)

    for filter_id in ('mgq50', 'sq30'):
        assert filter_id in reader.filters
def test_vcf_4_1_sv(self):
    """Work-in-progress walk over ``example-4.1-sv.vcf``; currently skipped.

    The original body began with a bare ``return``, so the runner counted
    this disabled test as a pass. ``skipTest`` reports it as skipped. The
    statements after it are kept for whoever finishes the test and are
    not executed.
    """
    self.skipTest('VCF 4.1 SV checks are not finalized yet')

    reader = cyvcf.Reader(fh('example-4.1-sv.vcf'))

    assert 'SVLEN' in reader.infos

    # test we can walk the file at least
    for r in reader:
        print(r)
        for c in r:
            print(c)
            assert c
            # asserting False while I work out what to check
            assert False
def test_vcf_4_1(self):
    """Work-in-progress walk over ``example-4.1.vcf``; currently skipped.

    The original body began with a bare ``return``, so the runner counted
    this disabled test as a pass. ``skipTest`` reports it as skipped. The
    statements after it are kept for whoever finishes the test and are
    not executed.
    """
    self.skipTest('VCF 4.1 checks are not finalized yet')

    reader = cyvcf.Reader(fh('example-4.1.vcf'))
    self.assertEqual(reader.metadata['fileformat'], 'VCFv4.1')

    # contigs were added in vcf4.1
    # probably need to add a reader.contigs attribute
    assert 'contig' in reader.metadata

    # test we can walk the file at least
    for r in reader:
        for c in r:
            assert c
            # asserting False while I work out what to check
            assert False
def test_gt_types(self):
    """Check per-sample ``gt_type`` for selected positions of the 4.0 example."""
    expected_by_pos = {
        14370: [0, 1, 3],
        17330: [0, 1, 0],
        1110696: [1, 1, 3],
        1230237: [0, 0, 0],
        1234567: [None, 1, 3],
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        # Dump each call's raw data before checking the derived types.
        for call in var:
            print(call.data)
        gt_types = [sample.gt_type for sample in var.samples]
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], gt_types)
def test_gt_depths(self):
    """Check per-sample ``gt_depth`` for selected positions of the 4.0 example."""
    expected_by_pos = {
        14370: [1, 8, 5],
        17330: [3, 5, 3],
        1110696: [6, 0, 4],
        1230237: [7, 4, 2],
        1234567: [4, 2, 3],
    }
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    for var in reader:
        # Dump each call's raw data before checking the derived depths.
        for call in var:
            print(call.data)
        gt_depths = [sample.gt_depth for sample in var.samples]
        if var.POS in expected_by_pos:
            self.assertEqual(expected_by_pos[var.POS], gt_depths)
def check_existing_vcf_info_tags(input_vcf, pcgr_directory, logger):
    """Check that the query VCF's INFO tags do not clash with PCGR's.

    Compares the INFO tags declared in the query VCF against the tags read
    from ``vep_infotags.tsv``, ``pcgr_infotags.tsv``, the per-database
    ``*.vcfanno.vcf_info_tags.txt`` files, and the literal tag
    ``EFFECT_PREDICTIONS``. Every clash is logged as an error.

    :param input_vcf: path of the query VCF file.
    :param pcgr_directory: directory that contains the ``data`` folder.
    :param logger: object providing ``info`` and ``error`` methods.
    :return: 1 when no tag clashes, -1 otherwise.
    """
    data_dir = os.path.join(pcgr_directory, 'data')
    vep_infotags_desc = pcgrutils.read_infotag_file(
        os.path.join(data_dir, 'vep_infotags.tsv'))
    pcgr_infotags_desc = pcgrutils.read_infotag_file(
        os.path.join(data_dir, 'pcgr_infotags.tsv'))

    vcfanno_tags = {}
    for db in [
            'intogen_driver_mut', 'dbsnp', 'oneKG', 'docm', 'exac', 'gnomad',
            'civic', 'cbmdb', 'dbnsfp', 'clinvar', 'icgc', 'cosmic'
    ]:
        vcfanno_tag_file = os.path.join(
            data_dir, str(db), str(db) + '.vcfanno.vcf_info_tags.txt')
        try:
            # Context manager so the tag file is closed after reading.
            with open(vcfanno_tag_file, 'r') as f:
                for line in f:
                    if line.startswith('##INFO'):
                        # Keep the text before the first comma, minus the
                        # '##INFO=<ID=' prefix, as the tag name.
                        tag = re.sub(r'##INFO=<ID=', '',
                                     str(line.rstrip().split(',')[0]))
                        vcfanno_tags[tag] = 1
        except IOError:
            logger.error('File ' + str(vcfanno_tag_file) + ' does not exist')

    # Only the header's INFO declarations are needed, so read them while the
    # file is open and let the handle close right after.
    with open(input_vcf, 'r') as vcf_handle:
        vcf_reader = cyvcf.Reader(vcf_handle)
        query_tags = list(vcf_reader.infos.keys())

    logger.info(
        'Checking if existing INFO tags of query VCF file coincide with PCGR INFO tags'
    )

    # Build the reserved-tag set once instead of re-listing keys per tag.
    reserved_tags = set(vep_infotags_desc.keys())
    reserved_tags.update(pcgr_infotags_desc.keys())
    reserved_tags.update(vcfanno_tags)
    reserved_tags.add('EFFECT_PREDICTIONS')

    ret = 1
    for k in query_tags:
        if k in reserved_tags:
            logger.error(
                'INFO tag ' + str(k) +
                ' in the query VCF coincides with a VCF annotation tag produced by PCGR - please remove or rename this tag in your query VCF'
            )
            ret = -1
    return ret
def test_vcf_4_0(self):
    """Walk ``example-4.0.vcf`` checking header, monomorphism and field types."""
    reader = cyvcf.Reader(fh('example-4.0.vcf'))
    assert reader.metadata['fileformat'] == 'VCFv4.0'

    # test we can walk the file at least
    for record in reader:

        # Only the record at 1230237 is expected to be monomorphic.
        if record.POS != 1230237:
            assert not record.is_monomorphic
        else:
            assert record.is_monomorphic

        if 'AF' in record.INFO:
            self.assertEqual(type(record.INFO['AF']), list)

        for call in record:
            assert call
            if not call.called:
                continue

            # issue 19, in the example ref the GQ is length 1
            self.assertEqual(type(call.data['GQ']), int)
            if 'HQ' in call.data and call.data['HQ'] is not None:
                self.assertEqual(type(call.data['HQ']), list)
#!/usr/bin/python
# Print every record of a VCF file, one per line, using cyvcf.
import sys

import cyvcf

if __name__ == "__main__":
    if len(sys.argv) != 2:
        # Parenthesised single-argument print works on Python 2 and 3.
        print("Usage: python %s <in.vcf>" % sys.argv[0])
        sys.exit()

    invcf = sys.argv[1]

    # Context manager so the input file is closed even if parsing fails.
    with open(invcf, 'rb') as vcf_handle:
        vcf_reader = cyvcf.Reader(vcf_handle)
        for record in vcf_reader:
            print(record)
def _vcf2tsv_filter_cell(rec_filter):
    """Render a record's FILTER value as one TSV cell ('PASS' if empty/None)."""
    if rec_filter is None:
        return 'PASS'
    if isinstance(rec_filter, list):
        if not rec_filter:
            return 'PASS'
        return ';'.join(str(n) for n in rec_filter)
    return str(rec_filter)


def _vcf2tsv_scalar_cell(value):
    """Render a non-list INFO value, unwrapping list-looking strings."""
    text = str(value)
    # For some reason(?) some lists are parsed into string objects by PyVCF
    if re.search(r'^\[(.|\s)+\]$', text):
        # Substitute on the stringified value, the same text that was
        # searched, so a non-string value cannot raise TypeError here.
        return re.sub(r'\[|\]|\'|\s{1,}', '', text)
    return text


def _vcf2tsv_info_cell(info_type, value):
    """Render one INFO value as a TSV cell according to its declared type."""
    if info_type == 'Flag':
        # True == 1 in Python, so one comparison covers both spellings.
        return 'True' if value == 1 else 'False'
    if isinstance(value, list):
        if info_type == 'Float' or info_type == 'Integer':
            return ','.join(str(n) for n in value)
        # Other types: missing members become 'NA'; str() keeps join safe.
        return ','.join('NA' if m is None else str(m) for m in value)
    return _vcf2tsv_scalar_cell(value)


def _vcf2tsv_sample_cells(sample, format_keys):
    """Return (cells, is_null) for one sample over the sorted FORMAT keys.

    ``is_null`` is True when the sample's GT is falsy or equals './.'.
    """
    cells = []
    is_null = False
    for elem in format_keys:
        p = sample[elem]
        if isinstance(p, list):
            cells.append(','.join(map(str, p)))
        elif not p:
            if elem == 'GT':
                is_null = True
                cells.append('./.')
            else:
                # None becomes '.'; other falsy values (e.g. 0) keep their text.
                cells.append('.' if p is None else str(p))
        else:
            if elem == 'GT' and p == './.':
                is_null = True
            cells.append(str(p))
    return cells, is_null


def vcf2tsv(query_vcf, out_tsv):
    """Convert a VCF file into a tab-separated file.

    Writes the header returned by ``get_tsv_header``, then for each record
    the fixed fields (CHROM, POS, ID, REF, ALT, QUAL, FILTER) followed by
    one column per INFO tag declared in the VCF header, in sorted tag
    order, with 'NA' for tags absent from the record. When the VCF has
    samples, one line is written per sample with the sample name and its
    FORMAT values in sorted key order; samples with a null genotype are
    skipped. Without samples, one line per record is written.

    :param query_vcf: path of the input VCF file.
    :param out_tsv: path of the TSV file to create.
    """
    # Context managers close both files even if a record fails to convert.
    with open(query_vcf, 'r') as vcf_handle, open(out_tsv, 'w') as f_out:
        vcf_reader = cyvcf.Reader(vcf_handle)
        f_out.write(get_tsv_header(vcf_reader) + '\n')

        # Loop-invariant: the declared INFO tags do not change per record.
        info_keys = sorted(vcf_reader.infos.keys())

        for rec in vcf_reader:
            alt = ','.join(str(n) for n in rec.ALT)
            rec_id = '.' if rec.ID is None else str(rec.ID)
            qual = '.' if rec.QUAL is None else str(rec.QUAL)
            fixed_fields_string = '\t'.join([
                str(rec.CHROM), str(rec.POS), rec_id, str(rec.REF), alt,
                qual, _vcf2tsv_filter_cell(rec.FILTER)
            ])

            infocolumn = []
            for keyw in info_keys:
                # `in` replaces dict.has_key(), which Python 3 removed.
                if keyw in rec.INFO:
                    # Index 2 of the header entry is compared to type names.
                    infocolumn.append(
                        _vcf2tsv_info_cell(vcf_reader.infos[keyw][2],
                                           rec.INFO[keyw]))
                else:
                    infocolumn.append('NA')
            record_prefix = fixed_fields_string + '\t' + '\t'.join(infocolumn)

            if len(vcf_reader.samples) > 0:
                format_keys = sorted(rec.FORMAT.split(':'))
                for i, sample_name in enumerate(vcf_reader.samples):
                    cells, is_null = _vcf2tsv_sample_cells(
                        rec.samples[i], format_keys)
                    if not is_null:
                        f_out.write(record_prefix + '\t' + str(sample_name) +
                                    '\t' + '\t'.join(cells) + '\n')
            else:
                f_out.write(record_prefix + '\n')