def test_cmp_vrt_iter_vrt2(self):
    """Diff of example1 against the gnomAD sample keeps all of example1.

    The expected size is the number of variants example1 yields on its
    own, so the test expects the diff to subtract nothing.
    """
    first = VariantSetFromFile(pkg_file('genomvar.test', 'data/example1.vcf'))
    second = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example_gnomad_1.vcf.gz'))
    expected = len(list(first.iter_vrt()))
    only_in_first = list(first.diff_vrt(second).iter_vrt())
    self.assertEqual(len(only_in_first), expected)
def test_cmp_vrt_iter_same(self):
    """A variant set compared with itself has every variant in common."""
    vs = VariantSetFromFile(pkg_file('genomvar.test', 'data/example2.vcf.gz'))
    everything = list(vs.find_vrt())
    shared = list(vs.comm_vrt(vs).iter_vrt())
    self.assertEqual(len(shared), len(everything))
def test_wrong_chrom_name_in_ref(self):
    """Region lookup on chr24 works with the ``chr25.fasta`` reference.

    The variant set is built with a reference loaded from
    ``data/chr25.fasta`` and then queried on ``chr24``; two variants are
    expected in ``chr24:1200-1210``.
    NOTE(review): presumably the FASTA does not contain ``chr24`` -- confirm
    against the fixture.
    """
    ref = Reference(pkg_file(__name__, 'data/chr25.fasta'))
    # Close the reference even if construction or the assertion fails;
    # otherwise the handle leaks into the rest of the test run.
    try:
        vset = VariantSetFromFile(
            pkg_file('genomvar.test', 'data/example1.vcf.gz'),
            reference=ref, index=True)
        found = list(vset.find_vrt(rgn='chr24:1200-1210'))
        self.assertEqual(len(found), 2)
    finally:
        ref.close()
def test_diff_callback(self):
    """The ``callback`` result is attached to each variant as ``attrib['cmp']``.

    example3.vcf is compared with itself using a callback that returns the
    VCF rows of the matching variants.  For every common variant the first
    entry of ``attrib['cmp']`` must equal the variant's own row, and eight
    variants are expected in total.
    """
    s1 = VariantSetFromFile(pkg_file('genomvar.test', 'data/example3.vcf'))
    s2 = VariantSetFromFile(pkg_file('genomvar.test', 'data/example3.vcf'))

    def matched_rows(matches):
        return [v.attrib['vcf_notation']['row'] for v in matches]

    # Count explicitly instead of reading the loop index after the loop:
    # with an empty iterator the index would be unbound and the test would
    # error with NameError rather than fail with a clear count mismatch.
    seen = 0
    for vrt in s1.comm_vrt(s2).iter_vrt(callback=matched_rows):
        self.assertEqual(vrt.attrib['vcf_notation']['row'],
                         vrt.attrib['cmp'][0])
        seen += 1
    self.assertEqual(seen, 8)
def test_cmp_stream(self):
    """Diff of example1 against example2 totals 14 unit variants."""
    s1 = VariantSetFromFile(pkg_file('genomvar.test', 'data/example1.vcf'))
    s2 = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))
    unit_total = sum(
        vrt.nof_unit_vrt() for vrt in s1.diff_vrt(s2).iter_vrt())
    self.assertEqual(unit_total, 14)
def test_index_errors(self):
    """Index-related failures surface as the expected exceptions.

    Requesting ``index=True`` for the plain ``example1.vcf`` must raise
    NoIndexFoundError; a region query on a set opened without an index
    must raise ValueError whose message mentions "index is required".
    """
    path = pkg_file('genomvar.test', 'data/example1.vcf')

    with self.assertRaises(NoIndexFoundError):
        VariantSetFromFile(path, reference=self.chr24, index=True)

    unindexed = VariantSetFromFile(path, reference=self.chr24)
    with self.assertRaises(ValueError) as caught:
        list(unindexed.find_vrt('chr1', 1, 100))

    message = caught.exception.args[0]
    self.assertIn('index is required', message.lower())
def test_find_vrt(self):
    """Indexed file-backed set and in-memory set agree on unit-variant totals.

    Both are built from example2.vcf.gz and compared twice: once for a
    ``find_vrt('chr24')`` query and once for a full ``iter_vrt()`` pass.
    """
    path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
    ivfs = VariantSetFromFile(path, index=True)
    vs = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))

    def unit_total(variants):
        return sum([v.nof_unit_vrt() for v in variants])

    from_file = unit_total(ivfs.find_vrt('chr24'))
    in_memory = unit_total(vs.find_vrt('chr24'))
    self.assertEqual(from_file, in_memory)

    from_file = unit_total(ivfs.iter_vrt())
    in_memory = unit_total(vs.iter_vrt())
    self.assertEqual(from_file, in_memory)
def test_class(self):
    """Exercise ``find_vrt`` on example1.vcf.gz with INFO parsing enabled."""
    vset = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example1.vcf.gz'),
        parse_info=True,
        reference=self.chr24,
        parse_samples='SAMP1')

    # Region query and the INFO values attached to the hits
    hits = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(hits), 2)
    first, second = hits
    self.assertEqual(first.attrib['info']['NSV'], 1)
    self.assertEqual(second.attrib['info']['RECN'], 19)

    # Multiallelic record: both resulting variants carry AF of 0.5
    hits = list(vset.find_vrt('chr24', 20, 30))
    self.assertEqual(len(hits), 2)
    for allele in hits:
        self.assertEqual(allele.attrib['info']['AF'], 0.5)

    # Precision of the query bounds
    for start, end in ((2095, 2096), (2098, 2100)):
        self.assertEqual(len(list(vset.find_vrt('chr24', start, end))), 1)

    # Listing everything is checked twice back-to-back; both passes must
    # yield 16 variants.
    for _ in range(2):
        self.assertEqual(len(list(vset.find_vrt())), 16)
def test_complex_info_example(self):
    """A long free-text INFO field from the gnomAD sample is parsed.

    Among the variants in ``chr1:69090-69091`` with ALT ``C``, at least one
    must have a non-None ``MutPred_Top5features`` value, and every such
    value must start with the expected text.
    """
    vset = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example_gnomad_1.vcf.gz'),
        parse_info=True, index=True)
    expected_prefix = 'Loss of sheet (P = 0.0817)| L'
    checked = False
    for vrt in vset.find_vrt(rgn='chr1:69090-69091'):
        if vrt.alt == 'C':
            if vrt.attrib['info']['MutPred_Top5features'] is not None:
                checked = True
                feature = vrt.attrib['info']['MutPred_Top5features']
                self.assertTrue(feature.startswith(expected_prefix))
    # Guard against the loop never reaching the assertion above
    self.assertTrue(checked)
def test_cmp_vrt_region_multisample2(self):
    """Common variants of two 1000 Genomes samples within a region.

    Every variant yielded must expose ``attrib``, and the region must
    contain at least one common variant.
    """
    region = '7:152134922-152436005'
    vs1 = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example_1000genomes_1.vcf.gz'),
        parse_samples=True, index=True)
    vs2 = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example_1000genomes_2.vcf.gz'),
        parse_samples=True, index=True)

    # First pass: check each variant as it is streamed
    for vrt in vs2.comm_vrt(vs1).region(rgn=region):
        self.assertTrue(hasattr(vrt, 'attrib'), msg='False for' + str(vrt))

    # Second pass: the comparison is run again and must not be empty
    comm = list(vs2.comm_vrt(vs1).region(rgn=region))
    self.assertGreater(len(comm), 0)
def test_cmp_vrt_iter_vrt(self):
    """example1 vs example2: 4 common variants with samples, 12 in the diff."""
    vs1 = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example1.vcf.gz'),
        parse_samples=True)
    vs2 = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'),
        parse_samples=True)

    n_common = 0
    for vrt in vs1.comm_vrt(vs2).iter_vrt():
        n_common += 1
        # Sample data must survive the comparison
        self.assertTrue(vrt.attrib['samples'],
                        msg='Vrt {} has no samples'.format(vrt))
    self.assertEqual(n_common, 4)

    only_first = list(vs1.diff_vrt(vs2).iter_vrt())
    self.assertEqual(len(only_first), 12)
def test_match(self):
    """``match`` finds a single counterpart for an MNP and for an insertion."""
    # Layout of the region being tested (column alignment approximate):
    #
    #   REF      TGG   TT
    #        2093      2099
    #   vs1      CCC   GG
    #   vrt            CG
    #            r1    r2,r3
    vs1 = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example1.vcf.gz'),
        index=True)

    mnp = factory.from_edit('chr24', 2098, 'TT', 'CG')
    self.assertEqual(len(vs1.match(mnp)), 1)

    # Test insertion
    insertion = factory.from_edit('chr24', 22, 'AG', 'AGG')
    found = vs1.match(insertion)
    self.assertEqual(len(found), 1)
def test_cmp_vrt_region(self):
    """Common variants in a region keep INFO and sample data of the left set."""
    vs1 = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example1.vcf.gz'),
        parse_samples=True, parse_info=True, index=True)
    vs2 = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'),
        parse_samples='SAMP1', parse_info=True, index=True)

    common = list(vs1.comm_vrt(vs2).region(rgn='chr24:10040-10050'))
    self.assertEqual(len(common), 2)

    # Only the first of the two hits is inspected in detail
    first, _second = common
    self.assertEqual(first.attrib['info']['AF'], 1.0)
    self.assertEqual(first.attrib['samples']['SAMP1']['GT'], (0, 1))
def test_from_variants(self):
    """``VariantSet.from_variants`` builds a queryable set and validates input.

    With a reference given, a variant beyond the chromosome bounds or on a
    chromosome missing from the reference must raise ValueError.
    """
    vfset = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example1.vcf'))

    in_memory = VariantSet.from_variants(list(vfset.iter_vrt()))
    hits = list(in_memory.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(hits), 2)

    # Test error on out of reference bounds
    out_of_bounds = variant.SNP('chr24', 10000000, 'C')
    with self.assertRaises(ValueError):
        VariantSet.from_variants(
            list(vfset.iter_vrt()) + [out_of_bounds],
            reference=self.chr24)

    # Test error on chromosome not in reference
    unknown_chrom = variant.SNP('chr2', 10, 'C')
    with self.assertRaises(ValueError):
        VariantSet.from_variants(
            list(vfset.iter_vrt()) + [unknown_chrom],
            reference=self.chr24)
def test_unsorted_VCF_input(self):
    """Comparing against a VCF with reversed records raises an error.

    A temporary copy of example1.vcf is written with its header intact
    and its data lines in reverse order; streaming a diff against it must
    raise UnsortedVariantFileError.
    """
    source = pkg_file('genomvar.test', 'data/example1.vcf')
    header = []
    lines = []
    with open(source, 'rt') as fh:
        for line in fh:
            if line.startswith('#'):
                header.append(line)
            else:
                lines.append(line)

    # The context manager guarantees the temporary file is closed and
    # removed even when the test fails; writing through its own handle
    # avoids opening the same path a second time.
    with tempfile.NamedTemporaryFile(mode='wt', suffix='.vcf') as tf:
        tf.writelines(header)
        tf.writelines(reversed(lines))
        tf.flush()  # make the content visible to readers opening by name

        vs1 = VariantSetFromFile(source)
        vs2 = VariantSetFromFile(tf.name)
        with self.assertRaises(UnsortedVariantFileError):
            list(vs1.diff_vrt(vs2).iter_vrt())
def test_differently_sorted_chroms(self):
    """Comparing files whose chromosomes are ordered differently raises.

    A temporary copy of example3.vcf is written with its records grouped
    in the order chr1, chr10, chr2; streaming a diff of the original
    against it must raise DifferentlySortedChromsError.
    """
    source = pkg_file('genomvar.test', 'data/example3.vcf')
    s1 = VariantSetFromFile(source)

    header = []
    variants = {}  # chromosome name -> data lines, in file order
    with open(source) as fh:
        for line in fh:
            if line.startswith('#'):
                header.append(line)
            else:
                chrom = line.split(maxsplit=1)[0]
                variants.setdefault(chrom, []).append(line)

    # The context manager guarantees the temporary file is closed and
    # removed even when the test fails; writing through its own handle
    # avoids opening the same path a second time.
    with tempfile.NamedTemporaryFile(mode='wt', suffix='.vcf') as tf:
        tf.writelines(header)
        for chrom in ['chr1', 'chr10', 'chr2']:
            tf.writelines(variants[chrom])
        tf.flush()  # make the content visible to readers opening by name

        s2 = VariantSetFromFile(tf.name)
        with self.assertRaises(DifferentlySortedChromsError):
            list(s1.diff_vrt(s2).iter_vrt())
def test_find_vrt2(self):
    """Region-string queries return the expected coordinates and types."""
    vset = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example1.vcf.gz'),
        reference=self.chr24, index=True)

    def hits(region):
        return list(vset.find_vrt(rgn=region))

    # Two variants sharing the same position
    self.assertEqual(len(hits('chr24:1200-1210')), 2)
    first, second = hits('chr24:1200-1210')
    self.assertEqual([first.start, first.end], [1206, 1207])
    self.assertEqual([second.start, second.end], [1206, 1207])

    # A single variant
    self.assertEqual(len(hits('chr24:3200-3205')), 1)
    only = hits('chr24:3200-3205')[0]
    self.assertEqual([only.start, only.end], [3201, 3202])

    # A deletion followed by an insertion
    self.assertEqual(len(hits('chr24:20-30')), 2)
    deletion, insertion = hits('chr24:20-30')
    self.assertEqual(
        [deletion.start, deletion.end, type(deletion.base)],
        [23, 24, variant.Del])
    self.assertEqual(
        [insertion.start, insertion.end, type(insertion.base)],
        [24, 25, variant.Ins])
def _cmp_vcf(f1, f2, out, match_partial=False, chunk=1000):
    """
    Writes comparison of two VCF files to a specified file handle.

    A VCF header is written first, followed by one row per compared
    variant.  Each row carries INFO fields ``whichVCF`` (first, second or
    both), ``ln`` (line number in the originating file) and, for variants
    present in both files, ``ln2`` (line numbers in the second file).

    Parameters
    ----------
    f1, f2
        VCF files to compare; each is passed to ``VariantSetFromFile``.
        A warning is issued for a file that ``_isindexed`` reports as not
        indexed.
    out
        Writable text handle receiving the header and the rows.
    match_partial, chunk
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict
        Maps 0 (first only), 1 (second only) and 2 (both) to the summed
        ``nof_unit_vrt()`` of the variants in that category.  Variants
        skipped at write time are still counted.

    Raises
    ------
    ValueError
        Re-raised from ``writer.get_row`` unless the variant is a
        Haplotype or Asterisk, in which case the variant is skipped.
    SystemExit
        With status 1 if ``out`` raises BrokenPipeError.
    """
    # Local import: only needed for the broken-pipe exit below.
    import sys

    # NOTE(review): 'numberin' below looks like a typo, but the text is
    # emitted into the output header, so it is left untouched here.
    info = [('VT', 1, 'String', 'Variant type'),
            ('whichVCF', 1, 'String',
             'Which input VCF contains the variant; first, second or both'),
            ('ln', 1, 'Integer',
             'Line number in input VCF variant originating from'),
            ('ln2', '.', 'Integer',
             'If whichVCF is both indicates line numberin the second file')]
    writer = VCFWriter(info_spec=info)
    out.write(writer.get_header())

    # Both inputs are opened the same way; the index check only decides
    # whether the performance warning is shown.
    variant_sets = []
    for path in (f1, f2):
        if not _isindexed(path):
            warnings.warn(
                '{} not indexed; may impact performance.'.format(path))
        variant_sets.append(VariantSetFromFile(path))
    vs1, vs2 = variant_sets

    _which = {0: 'first', 1: 'second', 2: 'both'}
    nof_vrt = {i: 0 for i in _which}

    def matched_rows(matches):
        # Rows of the matching variants; exposed as vrt.attrib['cmp'].
        return [v.attrib['vcf_notation']['row'] for v in matches]

    comparison = vs1._cmp_vrt(vs2, action='all')
    for which, vrt in comparison.iter_vrt(callback=matched_rows):
        nof_vrt[which] += vrt.nof_unit_vrt()

        # Variants found only in the second file are located there; all
        # others are located in the first file.
        origin = vs2 if which == 1 else vs1
        lineno = (vrt.attrib['vcf_notation']['row']
                  + origin.vcfreader.header_len + 1)
        if which == 2:
            lineno2 = [vs2.vcfreader.header_len + n + 1
                       for n in vrt.attrib['cmp']]
        else:
            lineno2 = None

        vrt.attrib['info'] = {'whichVCF': _which[which],
                              'ln': lineno,
                              'ln2': lineno2}
        try:
            row = writer.get_row(vrt)
        except ValueError:
            # Haplotype and Asterisk variants that cannot be rendered are
            # skipped; any other failure is a real error.
            if vrt.is_variant_instance(variant.Haplotype) \
               or vrt.is_variant_instance(variant.Asterisk):
                continue
            raise

        try:
            out.write(str(row) + '\n')
        except BrokenPipeError:
            # The reader went away (e.g. ``| head``).  sys.exit is used
            # because the bare ``exit`` helper is injected by the ``site``
            # module and is not guaranteed to exist.
            sys.exit(1)
    return nof_vrt
def test_no_index(self):
    """Asking for an index on the plain example3.vcf raises NoIndexFoundError."""
    path = pkg_file('genomvar.test', 'data/example3.vcf')
    with self.assertRaises(NoIndexFoundError):
        VariantSetFromFile(path, index=True)
def test_find_nonexistent_chrom(self):
    """Querying ``chr24`` on the 1000 Genomes sample yields an empty list."""
    vset = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example_1000genomes_1.vcf.gz'),
        index=True)
    found = list(vset.find_vrt('chr24'))
    self.assertEqual(found, [])
def test_ctg_len_without_ref(self):
    """``chroms`` is ``{'chr24'}`` for example1.vcf.gz opened without a reference."""
    path = pkg_file('genomvar.test', 'data/example1.vcf.gz')
    vset = VariantSetFromFile(path, parse_samples='SAMP1', index=True)
    self.assertEqual(vset.chroms, {'chr24'})