def test_search_multi():
    """Check that a single rsID query can return several VCF records.

    Searches the ``chr9-multi`` fixture for ``rs60995877`` and expects seven
    output lines, each carrying that rsID in the third tab-separated column.
    """
    rsidlist = ['rs60995877']
    vcffile = data_file('chr9-multi.vcf.gz')
    idxfile = data_file('chr9-multi.rsidx')
    conn = sqlite3.connect(idxfile)
    try:
        # list() consumes the search results before the connection is closed.
        outlines = list(rsidx.search.search(rsidlist, conn, vcffile))
    finally:
        # Release the SQLite handle even if the search raises.
        conn.close()
    assert len(outlines) == 7
    for line in outlines:
        assert line.split('\t')[2] == 'rs60995877'
def test_search_overlapping_variants(doheader, numlines):
    """Check the search for ``rs8051733`` against the ``overlap`` fixture.

    The last output line must contain ``rs8051733`` and must not contain
    ``rs967556605``.

    Args:
        doheader: Forwarded as the fourth positional argument of
            ``rsidx.search.search``.
        numlines: Number of output lines expected for this ``doheader`` value.
    """
    rsidlist = ['rs8051733']
    vcffile = data_file('overlap.vcf.gz')
    idxfile = data_file('overlap.sqlite3')
    conn = sqlite3.connect(idxfile)
    try:
        # list() consumes the search results before the connection is closed.
        outlines = list(rsidx.search.search(rsidlist, conn, vcffile, doheader))
    finally:
        # Release the SQLite handle even if the search raises.
        conn.close()
    assert len(outlines) == numlines
    assert '\trs8051733\t' in outlines[-1]
    assert '\trs967556605\t' not in outlines[-1]
def test_search_multiple_rsids_single_query():
    """Check that either rsID of a two-ID record retrieves that record.

    The ``multiple_id`` fixture is queried once with ``rs72634902`` and once
    with ``rs145742571``; each query must return exactly one line whose ID
    column reads ``rs72634902;rs145742571``.
    """
    for rsidlist in [['rs72634902'], ['rs145742571']]:
        vcffile = data_file('multiple_id.vcf.gz')
        idxfile = data_file('multiple_id.rsidx')
        conn = sqlite3.connect(idxfile)
        try:
            outlines = list(rsidx.search.search(rsidlist, conn, vcffile))
            assert len(outlines) == 1
            assert outlines[0].startswith(
                '1\t1900106\trs72634902;rs145742571\tT\tC,TCTC')
        finally:
            # Each iteration opens its own connection, so each one is closed
            # here, including when an assertion above fails.
            conn.close()
def test_search_missing_rsid(capsys):
    """Check that an unmatched ID yields no output and a warning on stderr.

    Queries the ``chr17-sample`` fixture with the integer ``123456789`` and
    expects an empty result plus the "no rsID matches" warning in the
    captured standard error.
    """
    vcf_path = data_file('chr17-sample.vcf.gz')
    index_path = data_file('chr17-sample.rsidx')
    connection = sqlite3.connect(index_path)
    hits = list(rsidx.search.search([123456789], connection, vcf_path))
    assert hits == []
    connection.close()
    captured = capsys.readouterr()
    assert '[rsidx::search] WARNING: no rsID matches' in captured.err
def test_search_stdout(capsys):
    """Check that the search CLI prints one line per rsID when no --out is given.

    Runs ``rsidx.search.main`` with five rsIDs against the ``chr17-sample``
    fixture and expects five lines in the captured standard output.
    """
    query_ids = [
        'rs1472751972', 'rs1287502205', 'rs897983471', 'rs1172219431',
        'rs189123651',
    ]
    command = [
        'search',
        data_file('chr17-sample.vcf.gz'),
        data_file('chr17-sample.rsidx'),
    ] + query_ids
    parsed = rsidx.cli.get_parser().parse_args(command)
    rsidx.search.main(parsed)
    stdout_text = capsys.readouterr().out
    records = stdout_text.strip().split('\n')
    assert len(records) == 5
def test_search_bad_rsids():
    """Check the search against a fixture whose ID column has been corrupted.

    Of the four rsIDs queried, only ``rs1234497371`` is expected to produce
    an output line.
    """
    vcf_path = data_file('chr4-sample-corrupted-ids.vcf.gz')
    index_path = data_file('chr4-sample-corrupted-ids.rsidx')
    query_ids = [
        'rs538736078',  # replaced by . in VCF
        'rs547329663',  # replaced by bogus ID in VCF
        'rs1440788236',  # valid RSID not present in VCF
        'rs1234497371',  # valid RSID present in VCF
    ]
    connection = sqlite3.connect(index_path)
    hits = list(rsidx.search.search(query_ids, connection, vcf_path))
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit.startswith('4\t218446\trs1234497371\tC\tCA,CAA')
    connection.close()
def test_search(rsidlist):
    """Check that the given rsIDs resolve to five known chr17 records.

    Args:
        rsidlist: The rsIDs to query; passed unchanged to
            ``rsidx.search.search``.

    Only the first five tab-separated fields of each output line are
    compared, and the comparison ignores output order.
    """
    expected = [
        ['17', '944196', 'rs182553373', 'G', 'A'],
        ['17', '611663', 'rs544992196', 'T', 'C'],
        ['17', '1946968', 'rs1245348147', 'T', 'C'],
        ['17', '567599', 'rs1335948438', 'C', 'T'],
        ['17', '374561', 'rs1440788236', 'G', 'T'],
    ]
    vcf_path = data_file('chr17-sample.vcf.gz')
    index_path = data_file('chr17-sample.rsidx')
    connection = sqlite3.connect(index_path)
    hits = list(rsidx.search.search(rsidlist, connection, vcf_path))
    assert len(hits) == 5
    observed = []
    for record in hits:
        observed.append(record.split('\t')[:5])
    assert sorted(observed) == sorted(expected)
    connection.close()
def test_search_cli(doheader, numlines, suffix):
    """Check that the search CLI writes the expected number of lines to --out.

    Args:
        doheader: Value assigned to ``args.header`` before running the search.
        numlines: Number of lines expected in the output file.
        suffix: File name suffix of the temporary output file.
    """
    query_ids = [
        'rs1472751972', 'rs1287502205', 'rs897983471', 'rs1172219431',
        'rs189123651',
    ]
    with NamedTemporaryFile(suffix=suffix) as outfile:
        command = [
            'search',
            data_file('chr17-sample.vcf.gz'),
            data_file('chr17-sample.rsidx'),
            '--out', outfile.name,
        ] + query_ids
        parsed = rsidx.cli.get_parser().parse_args(command)
        parsed.header = doheader
        rsidx.search.main(parsed)
        # Read the result back while the temporary file still exists.
        with rsidx.open(outfile.name, 'r') as handle:
            content = handle.read()
        written = content.strip().split('\n')
        assert len(written) == numlines
def test_index_force_reindex(capsys):
    """Check that ``index --force`` reports overwriting an existing index.

    The index command is run twice on the same output path; the captured
    standard error must mention ", overwriting".
    """
    with TempFileName(suffix='.rsidx') as idxfile:
        command = ['index', '--force', data_file('chr9-multi.vcf.gz'), idxfile]
        parsed = rsidx.cli.get_parser().parse_args(command)
        # The first run builds the index; the second run finds it in place.
        for _ in range(2):
            rsidx.index.main(parsed)
        captured = capsys.readouterr()
        assert ', overwriting' in captured.err
def test_index_no_force_reindex(capsys):
    """Check that re-indexing without ``--force`` exits with a refusal message.

    The second run on the same output path must raise ``SystemExit`` and the
    captured standard error must mention ", stubbornly refusing to proceed".
    """
    with TempFileName(suffix='.rsidx') as idxfile:
        command = ['index', data_file('chr9-multi.vcf.gz'), idxfile]
        parsed = rsidx.cli.get_parser().parse_args(command)
        # The first run creates the index file.
        rsidx.index.main(parsed)
        # The second run finds the existing file and must bail out.
        with pytest.raises(SystemExit):
            rsidx.index.main(parsed)
        captured = capsys.readouterr()
        assert ', stubbornly refusing to proceed' in captured.err
def test_index_multi(capsys):
    """Check that a freshly built index finds all records of one rsID.

    Indexes the ``chr9-multi`` fixture into a temporary file, runs the search
    CLI for ``rs60995877`` against that index, and expects the rsID to appear
    seven times in the captured standard output.
    """
    vcffile = data_file('chr9-multi.vcf.gz')
    with TempFileName(suffix='.rsidx') as idxfile:
        with rsidx.open(vcffile, 'r') as vcffh:
            dbconn = sqlite3.connect(idxfile)
            try:
                # ``with dbconn`` only commits or rolls back the transaction.
                with dbconn:
                    rsidx.index.index(dbconn, vcffh)
            finally:
                # It does not close the connection, so close it explicitly
                # before the search CLI opens the same index file.
                dbconn.close()
            arglist = ['search', vcffile, idxfile, 'rs60995877']
            args = rsidx.cli.get_parser().parse_args(arglist)
            rsidx.search.main(args)
    terminal = capsys.readouterr()
    assert terminal.out.count('\trs60995877\t') == 7
def test_index_bogus_rsids():
    """Check which IDs get indexed from a fixture with a corrupted ID column.

    After indexing ``chr4-sample-corrupted-ids``, a lookup of four rsID
    numbers in ``rsid_to_coord`` must return only the row for 1234497371.
    """
    with NamedTemporaryFile(suffix='.sqlite3') as db:
        dbconn = sqlite3.connect(db.name)
        try:
            # ``with dbconn`` only commits or rolls back the transaction.
            with dbconn:
                vcffile = data_file('chr4-sample-corrupted-ids.vcf.gz')
                with rsidx.open(vcffile, 'r') as vcffh:
                    rsidx.index.index(dbconn, vcffh)
                c = dbconn.cursor()
                query = ('SELECT * FROM rsid_to_coord WHERE rsid IN '
                         '(538736078, 547329663, 1440788236, 1234497371)')
                results = list(c.execute(query))
        finally:
            # Close the handle before the temporary file is removed.
            dbconn.close()
    assert results == [(1234497371, '4', 218446)]
def test_index_cli(mainfunc):
    """Check that an index entry point builds a queryable index.

    Args:
        mainfunc: Callable invoked with the parsed ``index`` arguments.

    After ``mainfunc`` runs on the ``chr17-sample`` fixture, the resulting
    ``rsid_to_coord`` table must hold the expected rows for two rsID numbers.
    """
    with TempFileName(suffix='.rsidx') as idxfile:
        arglist = ['index', data_file('chr17-sample.vcf.gz'), idxfile]
        args = rsidx.cli.get_parser().parse_args(arglist)
        mainfunc(args)
        conn = sqlite3.connect(idxfile)
        try:
            c = conn.cursor()
            query = ('SELECT * FROM rsid_to_coord WHERE rsid IN '
                     '(548749810, 956322221)')
            results = list(c.execute(query))
        finally:
            # Close the handle before leaving the temporary-file context.
            conn.close()
    assert sorted(results) == sorted([(548749810, '17', 1098730),
                                      (956322221, '17', 1227227)])
def test_index_multi_rsids():
    """Check that two rsIDs can be indexed to the same coordinate.

    After indexing the ``multiple_id`` fixture, both 72634902 and 145742571
    must map to chromosome ``1``, position 1900106 in ``rsid_to_coord``.
    """
    with NamedTemporaryFile(suffix='.sqlite3') as db:
        dbconn = sqlite3.connect(db.name)
        try:
            # ``with dbconn`` only commits or rolls back the transaction.
            with dbconn:
                vcffile = data_file('multiple_id.vcf.gz')
                with rsidx.open(vcffile, 'r') as vcffh:
                    rsidx.index.index(dbconn, vcffh)
                c = dbconn.cursor()
                query = ('SELECT * FROM rsid_to_coord WHERE rsid IN '
                         '(72634902, 145742571)')
                results = list(c.execute(query))
        finally:
            # Close the handle before the temporary file is removed.
            dbconn.close()
    assert sorted(results) == sorted([(72634902, '1', 1900106),
                                      (145742571, '1', 1900106)])
def test_index(cachesize, mmapsize):
    """Check that indexing the chr17 fixture stores the expected coordinates.

    Args:
        cachesize: Forwarded to ``rsidx.index.index`` as ``cache_size``.
        mmapsize: Forwarded to ``rsidx.index.index`` as ``mmap_size``.

    Two rsID numbers are looked up in ``rsid_to_coord`` afterwards and
    compared with their expected chromosome and position.
    """
    with NamedTemporaryFile(suffix='.sqlite3') as db:
        dbconn = sqlite3.connect(db.name)
        try:
            # ``with dbconn`` only commits or rolls back the transaction.
            with dbconn:
                with rsidx.open(data_file('chr17-sample.vcf.gz'), 'r') as vcffh:
                    rsidx.index.index(dbconn, vcffh, cache_size=cachesize,
                                      mmap_size=mmapsize, logint=10)
                c = dbconn.cursor()
                query = ('SELECT * FROM rsid_to_coord WHERE rsid IN '
                         '(1238461543, 1472751972)')
                results = list(c.execute(query))
        finally:
            # Close the handle before the temporary file is removed.
            dbconn.close()
    assert sorted(results) == sorted([(1238461543, '17', 624973),
                                      (1472751972, '17', 132359)])