def test_get_sample_names(self):
    '''
    Check that get_sample_names() returns exactly the three sample names
    NA00001, NA00002 and NA00003 for the example VCF after
    jump2variants() has been called.
    '''
    with vcf.VcfFile(self.example_vcf, 'r') as vcffile:
        vcffile.jump2variants()
        sample_names = vcffile.get_sample_names()
        self.assertEqual(len(sample_names), 3)
        # assertIn reports both the missing name and the container on
        # failure; assertTrue(x in y) would only say "False is not true"
        for expected_name in ('NA00001', 'NA00002', 'NA00003'):
            self.assertIn(expected_name, sample_names)
def test_is_variant_line(self):
    '''
    is_variant_line() must return False for the first two lines of the
    example file and True for variant rows, both when the row is read
    from the file and when it is supplied as a literal string.
    '''
    literal_row = '20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.\n'
    with vcf.VcfFile(self.example_vcf, 'r') as reader:
        # The two leading lines of the file are not variant rows
        for _ in range(2):
            leading_line = reader.readline()
            self.assertFalse(reader.is_variant_line(leading_line))
        # After jumping, the next line read is expected to be a variant
        reader.jump2variants()
        row_from_file = reader.readline()
        self.assertTrue(reader.is_variant_line(row_from_file))
        self.assertTrue(reader.is_variant_line(literal_row))
def test_jump2variants(self):
    '''
    After jump2variants() the reader's column_names must equal the
    expected column names, and the next readline() must return the
    first variant row of the example file.
    '''
    first_variant_row = '20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.\n'
    with vcf.VcfFile(self.example_vcf, 'r') as reader:
        reader.jump2variants()
        # Column names should have been set as part of the jump
        self.assertEqual(reader.column_names,
                         self.example_vcf_column_names)
        # The file position should now be at the first variant
        following_line = reader.readline()
        self.assertEqual(first_variant_row, following_line)
def test_set_column_names(self):
    '''
    set_column_names() must raise ValueError for a non-header line and,
    given the header string, populate column_names so that positions
    0, 1 and 7 hold CHROM, POS and INFO.
    '''
    with vcf.VcfFile(self.example_vcf, 'r') as reader:
        # The first line of the file is not the column header line, so
        # it has to be rejected
        non_header_line = reader.readline()
        with self.assertRaises(ValueError):
            reader.set_column_names(non_header_line)
        # The real header line should be parsed into column names
        reader.set_column_names(self.example_vcf_column_names_str)
        for position, expected_name in ((0, 'CHROM'),
                                        (1, 'POS'),
                                        (7, 'INFO')):
            self.assertEqual(reader.column_names[position], expected_name)
def test_parse_line(self):
    '''
    Iterate over the reader after jump2variants() and check that
    parse_line() maps each of the first two lines onto the expected
    column values.
    '''
    # Expected (column, value) pairs, one tuple per row, in check order
    expected_rows = (
        (('CHROM', '20'),
         ('ID', 'rs6054257'),
         ('NA00003', '1/1:43:5:.,.')),
        (('INFO', 'NS=3;DP=11;AF=0.017'),
         ('FORMAT', 'GT:GQ:DP:HQ'),
         ('NA00003', '0/0:41:3')),
    )
    with vcf.VcfFile(self.example_vcf, 'r') as vcffile:
        vcffile.jump2variants()
        rows_checked = 0
        for row_index, line in enumerate(vcffile):
            variant = vcffile.parse_line(line)
            for column, expected_value in expected_rows[row_index]:
                self.assertEqual(variant[column], expected_value)
            rows_checked = row_index + 1
            if rows_checked == len(expected_rows):
                break  # only the first two rows are inspected
        # Without this check the test would pass without asserting
        # anything if iteration produced fewer than two rows
        self.assertEqual(rows_checked, len(expected_rows))
def test_read_variant(self):
    '''
    read_variant() must raise ValueError when the next line is not a
    variant row, and after jump2variants() must return the first two
    variants with the expected column values.
    '''
    first_expected = (('CHROM', '20'),
                      ('ID', 'rs6054257'),
                      ('NA00003', '1/1:43:5:.,.'))
    second_expected = (('INFO', 'NS=3;DP=11;AF=0.017'),
                       ('FORMAT', 'GT:GQ:DP:HQ'),
                       ('NA00003', '0/0:41:3'))
    with vcf.VcfFile(self.example_vcf, 'r') as reader:
        # At the start of the file the next line is not a variant row
        with self.assertRaises(ValueError):
            reader.read_variant()
        # From the variants section onwards rows are read in order
        reader.jump2variants()
        for expected_fields in (first_expected, second_expected):
            record = reader.read_variant()
            for column, expected_value in expected_fields:
                self.assertEqual(record[column], expected_value)
def test_parse_info(self):
    '''
    Check the two values returned by parse_info() for the first two
    variants: the first is indexed by INFO key (AF, DP, NS), the second
    is expected to contain exactly DB and H2 for the first variant.
    '''
    with vcf.VcfFile(self.example_vcf, 'r') as vcffile:
        # Check that the variant row is read in correctly
        vcffile.jump2variants()
        variant = vcffile.read_variant()
        info_map, info_single = vcffile.parse_info(variant)
        self.assertEqual(info_map['AF'], '0.5')
        self.assertEqual(info_map['DP'], '14')
        self.assertEqual(len(info_single), 2)
        # assertIn shows the container contents when the check fails,
        # which assertTrue(x in y) does not
        self.assertIn('DB', info_single)
        self.assertIn('H2', info_single)
        variant = vcffile.read_variant()
        info_map, info_single = vcffile.parse_info(variant)
        self.assertEqual(info_map['NS'], '3')
        self.assertEqual(info_map['AF'], '0.017')
def test_parse_samples(self):
    '''
    Check that parse_samples() returns one entry per sample, each
    indexable by FORMAT field name, for the first two variants of the
    example VCF.
    '''
    with vcf.VcfFile(self.example_vcf, 'r') as vcffile:
        vcffile.jump2variants()
        variant = vcffile.read_variant()
        sample2field2val = vcffile.parse_samples(variant)
        self.assertEqual(len(sample2field2val), 3)
        # assertIn names the missing sample and shows the container on
        # failure, unlike assertTrue(x in y)
        for sample in ('NA00001', 'NA00002', 'NA00003'):
            self.assertIn(sample, sample2field2val)
        self.assertEqual(sample2field2val['NA00001']['GT'], '0|0')
        self.assertEqual(sample2field2val['NA00002']['GQ'], '48')
        self.assertEqual(sample2field2val['NA00003']['HQ'], '.,.')
        variant = vcffile.read_variant()
        sample2field2val = vcffile.parse_samples(variant)
        self.assertEqual(sample2field2val['NA00001']['HQ'], '58,50')
        self.assertEqual(sample2field2val['NA00002']['GT'], '0|1')
        self.assertEqual(sample2field2val['NA00003']['GQ'], '41')
def test_get_sample_gt(self):
    '''
    Check get_sample_gt() against successive variants of the example
    VCF. The expected strings use '|' between alleles when phased=True
    and '/' when phased=False.
    '''
    # (variants to read before checking, sample, phased, expected)
    cases = (
        (1, 'NA00001', True, 'G|G'),
        (1, 'NA00002', True, 'T|A'),
        (1, 'NA00003', False, 'T/T'),
        # Two reads: the fourth variant is skipped, the fifth is checked
        (2, 'NA00001', False, 'G/GTC'),
    )
    with vcf.VcfFile(self.example_vcf, 'r') as reader:
        reader.jump2variants()
        for reads, sample, phased, expected_gt in cases:
            for _ in range(reads):
                record = reader.read_variant()
            genotype = reader.get_sample_gt(record, sample, phased=phased)
            self.assertEqual(genotype, expected_gt)
def _classify_allele(ref, alt):
    '''
    Classify one REF/ALT allele pair by comparing their lengths.

    Returns 'snp' when neither allele is longer than one base, 'del'
    when ALT is shorter than REF, 'ins' when it is longer, and
    'dnp' / 'tnp' / 'onp' for equal-length substitutions of two, three
    or more bases.
    '''
    len_ref = len(ref)
    len_alt = len(alt)
    if len_ref <= 1 and len_alt <= 1:
        return 'snp'
    if len_ref > len_alt:
        return 'del'
    if len_ref < len_alt:
        return 'ins'
    # Equal lengths, both longer than one base
    if len_ref == 2:
        return 'dnp'
    if len_ref == 3:
        return 'tnp'
    return 'onp'


def _classify_variant(ref, alt):
    '''
    Classify a variant whose ALT column may list several alleles.

    ALT may hold a comma-separated list of alleles; measuring the raw
    string would count the commas and extra alleles as bases (so
    'A' -> 'G,T' would look like an insertion). Each allele is therefore
    classified on its own. The distinct types are returned in order of
    first appearance, joined by ','. A single-allele ALT yields a single
    type.
    '''
    allele_types = []
    for allele in alt.split(','):
        allele_type = _classify_allele(ref, allele)
        if allele_type not in allele_types:
            allele_types.append(allele_type)
    return ','.join(allele_types)


def parse_vcf(vcf_in, fout):
    '''
    Read through the vcf file, and parse it. Output results in tsv format

    vcf_in is handed to vcf.VcfFile for reading; fout must provide a
    write() method. One header row is written, followed by one row per
    variant with the columns Chrom, Position, Ref, Alt, Type, AF and
    NoCall, then one genotype column and one depth column per sample.

    AF is written as 'NA' when the INFO column has no AF entry, and a
    sample's depth is written as 'NA' when it has no DP field.
    '''
    with vcf.VcfFile(vcf_in, 'r') as vcffile:
        # Skip to the variants section of the vcf file
        vcffile.jump2variants()
        sample_names = vcffile.get_sample_names()

        # Output header line
        header = ['Chrom', 'Position', 'Ref', 'Alt', 'Type', 'AF', 'NoCall']
        header.extend([sn + '_gt' for sn in sample_names])
        header.extend([sn + '_dp' for sn in sample_names])
        fout.write('%s\n' % '\t'.join(header))

        # Read in the variant lines
        for line in vcffile:
            # Get parsed variant data
            variant = vcffile.parse_line(line)
            chrom = variant['CHROM']
            pos = variant['POS']
            ref = variant['REF']
            alt = variant['ALT']

            # Parse the info column; the second return value is unused
            info_map, _info_single = vcffile.parse_info(variant)
            # AF is not guaranteed to be present; fall back to 'NA' the
            # same way a missing DP is handled below
            af = info_map['AF'] if 'AF' in info_map else 'NA'

            variant_type = _classify_variant(ref, alt)

            # Sample genotypes and depths
            samples2field2val = vcffile.parse_samples(variant)
            samples_gts = []
            samples_dps = []
            num_nocall = 0
            for sample in sample_names:
                sample_gt = vcffile.get_sample_gt(variant, sample)
                # A genotype of 'N/N' is counted as a no-call
                if sample_gt == 'N/N':
                    num_nocall += 1
                sample_fields = samples2field2val[sample]
                sample_dp = (sample_fields['DP']
                             if 'DP' in sample_fields else 'NA')
                samples_gts.append(sample_gt)
                samples_dps.append(sample_dp)

            # Output the row for this variant
            row = [chrom, pos, ref, alt, variant_type, af, str(num_nocall)]
            fout.write('%s\n' % '\t'.join(row + samples_gts + samples_dps))
def test_is_header(self):
    '''
    is_header() must accept the column-names header string and reject
    the same string with an additional '#' prepended.
    '''
    with vcf.VcfFile(self.example_vcf, 'r') as reader:
        header_line = self.example_vcf_column_names_str
        accepted = reader.is_header(header_line)
        self.assertTrue(accepted)
        # One extra leading '#' must stop the line counting as the header
        prefixed_line = '#' + self.example_vcf_column_names_str
        rejected = reader.is_header(prefixed_line)
        self.assertFalse(rejected)
def test_is_meta(self):
    '''
    is_meta() must accept a '##'-prefixed line and reject lines that
    start with a single '#' or with no '#' at all.
    '''
    with vcf.VcfFile(self.example_vcf, 'r') as reader:
        meta_line = '##fileDate=20120818'
        self.assertTrue(reader.is_meta(meta_line))
        for non_meta_line in ('# blahblah', 'foo'):
            self.assertFalse(reader.is_meta(non_meta_line))