def test_cmp_vrt_iter_vrt2(self):
    # The difference example1 - gnomAD sample is expected to contain as many
    # variants as example1 itself.
    first = VariantSetFromFile(pkg_file('genomvar.test', 'data/example1.vcf'))
    second = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example_gnomad_1.vcf.gz'))
    everything = list(first.iter_vrt())
    only_in_first = list(first.diff_vrt(second).iter_vrt())
    self.assertEqual(len(only_in_first), len(everything))
def test_submission_with_bg(self):
    # Manual stress test: posts the same enrichment request (with a custom
    # background file) from 200 daemon threads started at random intervals,
    # prints newly appearing local names as a crude leak diagnostic, then
    # sleeps for a very long time so the process can be inspected.
    with open(pkg_file('GOnet', 'data/tests/genelist3.csv'), 'r') as fh:
        input_lines = fh.read()

    bg_path = pkg_file('GOnet', 'data/tests/CD8_cells_background_TPM10.lst')
    # The background file must stay open while worker threads upload it;
    # it is closed as soon as every thread has been joined.
    with open(bg_path, 'r') as bg_file:
        request_data = {
            'submit': ['Submit'],
            'paste_data': [input_lines],
            'bg_file': [bg_file],
            'namespace': ['biological_process'],
            'analysis_type': ['enrich'],
            'output_type': ['graph'],
            'csv_separator': [','],
            'qvalue': [0.0001],
        }
        threads = []
        seen = set()
        for n in range(200):
            # Rewind so the next request reads the file from the start.
            bg_file.seek(0)
            t = Thread(target=c.post,
                       args=(urls.reverse('GOnet-submit-form'), request_data))
            t.daemon = True
            t.start()
            threads.append(t)
            sleep(random.random() * 10)
            print('>!< Spamming task', len(threads))
            print('new vars', set(vars().keys()).difference(seen))
            seen = set(vars().keys())
        for t in threads:
            t.join()

    print('deleting threads')
    del threads
    print('sleeping......................................')
    sleep(100000000)
def test_wrong_chrom_name_in_ref(self):
    # The reference FASTA is chr25 while the query targets chr24; the lookup
    # is still expected to return the two variants in chr24:1200-1210.
    ref = Reference(pkg_file(__name__, 'data/chr25.fasta'))
    try:
        vset = VariantSetFromFile(pkg_file('genomvar.test',
                                           'data/example1.vcf.gz'),
                                  reference=ref,
                                  index=True)
        found = list(vset.find_vrt(rgn='chr24:1200-1210'))
        self.assertEqual(len(found), 2)
    finally:
        # Close the reference even when construction or the assertion fails.
        ref.close()
def test_diff_callback(self):
    # Compares example3.vcf with itself and checks that the callback result
    # (stored under attrib['cmp']) carries the row of the matching variant.
    s1 = VariantSetFromFile(pkg_file('genomvar.test', 'data/example3.vcf'))
    s2 = VariantSetFromFile(pkg_file('genomvar.test', 'data/example3.vcf'))

    def rows_of(matches):
        # Callback: map the matching variants to their VCF row values.
        return [v.attrib['vcf_notation']['row'] for v in matches]

    # Start at -1 so an empty iteration fails the final assertion cleanly
    # instead of raising NameError on an unbound loop variable.
    last_index = -1
    for last_index, vrt in enumerate(
            s1.comm_vrt(s2).iter_vrt(callback=rows_of)):
        self.assertEqual(vrt.attrib['vcf_notation']['row'],
                         vrt.attrib['cmp'][0])
    # Index of the last yielded variant, i.e. 8 variants in total.
    self.assertEqual(last_index, 7)
def test_cmp_stream(self):
    # Sum of unit variants present in example1 but absent from example2.
    left = VariantSetFromFile(pkg_file('genomvar.test', 'data/example1.vcf'))
    right = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))
    unit_total = sum(
        item.nof_unit_vrt() for item in left.diff_vrt(right).iter_vrt())
    self.assertEqual(unit_total, 14)
def test_find_vrt(self):
    # The indexed file-backed set and the in-memory set built from the same
    # VCF must report the same number of unit variants.
    def unit_count(variants):
        return sum(v.nof_unit_vrt() for v in variants)

    ivfs = VariantSetFromFile(pkg_file('genomvar.test',
                                       'data/example2.vcf.gz'),
                              index=True)
    vs = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))

    # Restricted to a single chromosome.
    from_file_chr24 = unit_count(ivfs.find_vrt('chr24'))
    in_memory_chr24 = unit_count(vs.find_vrt('chr24'))
    self.assertEqual(from_file_chr24, in_memory_chr24)

    # Over the full content.
    from_file_all = unit_count(ivfs.iter_vrt())
    in_memory_all = unit_count(vs.iter_vrt())
    self.assertEqual(from_file_all, in_memory_all)
class TestStreamCmp(TestCase):
    # Tests of the streaming VCF-to-VCF comparison done by _cmp_vcf.

    # Input files shared by the tests below.
    f1 = pkg_file('genomvar.test', 'data/example1.vcf.gz')
    f2 = pkg_file('genomvar.test', 'data/example2.vcf.gz')

    def test_cmp_vcf_files(self):
        # Checks the counters returned by _cmp_vcf and the INFO annotations
        # of the first and last rows written to the output stream.
        def _get_info(info):
            # Parse "k1=v1;k2=v2" into a dict; "." denotes an empty INFO.
            if info == '.':
                return {}
            return dict(token.split('=', maxsplit=1)
                        for token in info.split(';'))

        out = io.StringIO()
        with warnings.catch_warnings(record=True):
            cnt = _cmp_vcf(self.f1, self.f2, out=out)
        self.assertEqual(cnt[0], 14)
        self.assertEqual(cnt[2], 4)
        self.assertEqual(cnt[1], 12)

        out.seek(0)
        noheader = itertools.dropwhile(lambda l: l.startswith('#'), out)
        rows = [VCFRow(*l.strip().split('\t')) for l in noheader]

        row0 = rows[0]
        info = _get_info(row0.INFO)
        self.assertEqual([row0.CHROM, row0.POS, row0.REF, row0.ALT],
                         ['chr23', 7462, 'G', 'T'])
        self.assertEqual(info['whichVCF'], 'second')
        self.assertEqual(info['ln'], '13')

        info = _get_info(rows[-1].INFO)
        self.assertEqual(info['ln'], '30')
        self.assertEqual(info['ln2'], '21')

    def test_unsorted_VCF_input(self):
        # Writes example1.vcf with its data lines reversed and expects
        # _cmp_vcf to reject it with UnsortedVariantFileError.
        header = []
        lines = []
        with open(pkg_file('genomvar.test', 'data/example1.vcf'), 'rt') as fh:
            for line in fh:
                if line.startswith('#'):
                    header.append(line)
                else:
                    lines.append(line)

        # The context manager closes (and thereby removes) the temporary
        # file deterministically, even when the test fails.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                fh.writelines(header)
                fh.writelines(reversed(lines))

            out = io.StringIO()
            with warnings.catch_warnings(record=True):
                with self.assertRaises(UnsortedVariantFileError):
                    _cmp_vcf(pkg_file('genomvar.test', 'data/example1.vcf'),
                             tf.name, out=out)
def test_GO_annotate_genelist14(self):
    # Smoke test: submits a mouse annotation job with custom terms. There
    # are no assertions; the test passes as long as the POST does not raise.
    with open(pkg_file(__name__, 'data/genelist14.tsv'), 'r') as fh:
        input_lines = fh.read()
    with open(pkg_file(__name__, 'data/custom_annotation2.txt'), 'r') as fh:
        custom_annotation = fh.read()

    req = dict(
        job_req, **{
            'paste_data': input_lines,
            'analysis_type': 'annot',
            'slim': 'custom',
            'custom_terms': custom_annotation,
            'organism': 'mouse'
        })
    c.post(urls.reverse('GOnet-submit-form'), req, follow=True)
def test_GO_annotate_genelist2(self):
    # Annotation of genelist2 against a custom term list: checks the
    # resulting network plus the CSV and TXT exports.
    with open(pkg_file(__name__, 'data/genelist2.tsv'), 'r') as fh:
        input_lines = fh.read()
    with open(pkg_file(__name__, 'data/custom_annotation.txt'), 'r') as fh:
        custom_annotation = fh.read()

    req = dict(
        job_req, **{
            'paste_data': input_lines,
            'analysis_type': 'annot',
            'slim': 'custom',
            'custom_terms': custom_annotation
        })
    resp = c.post(urls.reverse('GOnet-submit-form'), req, follow=True)
    self.assertEqual(resp.status_code, 200)

    sn = GOnetSubmission.objects.latest('submit_time')
    net_dict = json.loads(sn.network)
    G = cyjs.cyjs2nx(net_dict)
    self.assertListEqual(list(G.predecessors('P29376')), ['GO:0071300'])
    self.assertListEqual(list(G.predecessors('Q5TBA9')), ['GO:0016043'])
    self.assertListEqual(list(G.predecessors('P16403')), ['GO:0065003'])

    # Test node GO:0071300 (cellular response to retinoic acid)
    matching = [node for node in net_dict['elements']['nodes']
                if node['data']['id'] == 'GO:0071300']
    n = matching[0]
    self.assertEqual(n['data']['tot_gn'],
                     len(O.get_attr('GO:0071300', 'human')))

    # Test CSV response
    csv_resp = c.get(urls.reverse('GOnet-csv-res', args=(str(sn.id), )))
    res = pd.read_csv(io.StringIO(csv_resp.content.decode()),
                      sep=',', index_col=1)
    self.assertIn('LTK', res.loc['GO:0032526', 'Genes'])

    # Test TXT response
    txt_resp = c.get(urls.reverse('GOnet-txt-res', args=(str(sn.id), )))
    line_found = False
    for line in io.StringIO(txt_resp.content.decode()):
        if line.strip().startswith('GO:0032526'):
            self.assertIn('LTK', line)
            line_found = True
            break
    self.assertTrue(line_found)
def test_cmp_vrt_iter_vrt(self):
    # Common and differing variants of example1 vs example2, with sample
    # data parsed; every common variant must carry non-empty sample data.
    def load(relpath):
        return VariantSetFromFile(pkg_file('genomvar.test', relpath),
                                  parse_samples=True)

    left = load('data/example1.vcf.gz')
    right = load('data/example2.vcf.gz')

    shared = []
    for item in left.comm_vrt(right).iter_vrt():
        shared.append(item)
        self.assertTrue(item.attrib['samples'],
                        msg='Vrt {} has no samples'.format(item))
    self.assertEqual(len(shared), 4)

    only_left = list(left.diff_vrt(right).iter_vrt())
    self.assertEqual(len(only_left), 12)
def test_cmp_vrt_region_multisample2(self):
    # Region-restricted comparison of two multi-sample 1000 Genomes files.
    def load(relpath):
        return VariantSetFromFile(pkg_file('genomvar.test', relpath),
                                  parse_samples=True,
                                  index=True)

    vs1 = load('data/example_1000genomes_1.vcf.gz')
    vs2 = load('data/example_1000genomes_2.vcf.gz')
    window = '7:152134922-152436005'

    # First pass: every yielded object must expose ``attrib``.
    for vrt in vs2.comm_vrt(vs1).region(rgn=window):
        self.assertTrue(hasattr(vrt, 'attrib'), msg='False for' + str(vrt))

    # Second pass over a fresh comparison: the result must be non-empty.
    comm = list(vs2.comm_vrt(vs1).region(rgn=window))
    self.assertGreater(len(comm), 0)
def test_cmp_vrt_region(self):
    # Common variants within chr24:10040-10050, with INFO and sample data
    # parsed on both sides.
    common_opts = dict(parse_info=True, index=True)
    vs1 = VariantSetFromFile(pkg_file('genomvar.test',
                                      'data/example1.vcf.gz'),
                             parse_samples=True, **common_opts)
    vs2 = VariantSetFromFile(pkg_file('genomvar.test',
                                      'data/example2.vcf.gz'),
                             parse_samples='SAMP1', **common_opts)

    comm = list(vs1.comm_vrt(vs2).region(rgn='chr24:10040-10050'))
    self.assertEqual(len(comm), 2)

    # Only the first of the two common variants is inspected further.
    first, _second = comm
    self.assertEqual(first.attrib['info']['AF'], 1.0)
    self.assertEqual(first.attrib['samples']['SAMP1']['GT'], (0, 1))
def test_asterisk_variant(self):
    # A VCF containing "*" alleles must still yield three variants in
    # chr1:995507-995515.
    vcf_path = pkg_file('genomvar.test', 'data/example_with_asterisk.vcf.gz')
    vset = VariantSet.from_vcf(vcf_path, parse_info=True)
    found = list(vset.find_vrt('chr1', 995507, 995515))
    self.assertEqual(len(found), 3)
def test_init(self):
    # Header length and FORMAT dtype description produced by VCFReader.
    reader = VCFReader(pkg_file('genomvar.test', 'data/example1.vcf'))
    self.assertEqual(reader.header_len, 15)

    format_spec = reader._dtype['format']
    self.assertEqual(len(format_spec), 1)

    gt_spec = format_spec['GT']
    is_object = issubclass(gt_spec['dtype'], np.object_)
    # NOTE(review): the message reads the 'type' key while the check uses
    # 'dtype'; kept as in the original -- confirm this is intended.
    self.assertTrue(is_object, msg='Got type' + str(gt_spec['type']))
def test_minimal_VCF_definition_io(self):
    # Round trip through a VCF reduced to ##fileformat, the #CHROM line and
    # the data lines: read, rewrite the GT values, write, and read again.
    minimal = io.StringIO()
    with open(pkg_file('genomvar.test', 'data/example1.vcf'), 'rt') as src:
        minimal.writelines(
            ln for ln in src
            if ln.startswith(('##fileformat', '#CHROM'))
            or not ln.startswith('#'))
    minimal.seek(0)

    reader = VCFReader(minimal)
    rendered = io.StringIO()
    writer = VCFWriter(format_spec=[RESERVED_FORMAT.GT],
                       samples=reader.samples)

    originals = []
    for vrt in reader.iter_vrt(parse_samples=True):
        # Without a FORMAT definition in the header, GT comes back as text.
        self.assertTrue(
            isinstance(vrt.attrib['samples']['SAMP1']['GT'], str))
        # Replace the textual genotype with the tuple form, or None.
        vrt.attrib['samples']['SAMP1']['GT'] = (
            (0, 1)
            if vrt.attrib['samples']['SAMP1'].get('GT') == '0/1'
            else None)
        rendered.write(str(writer.get_row(vrt)))
        originals.append(vrt)
    originals.sort(key=lambda v: v.start)

    rendered.seek(0)
    reread = sorted(VCFReader(rendered).iter_vrt(), key=lambda v: v.start)
    for before, after in zip(originals, reread):
        self.assertTrue(before.edit_equal(after))
def test_GO_annotate_genelist2_vs_enriched(self):
    # Runs an enrichment job, then an annotation job restricted to the
    # significantly enriched terms, and expects identical networks.
    with open(pkg_file(__name__, 'data/genelist2.tsv'), 'r') as fh:
        input_lines = fh.read()

    # Step 1: enrichment with default job parameters.
    req = dict(job_req, **{'paste_data': input_lines})
    c.post(urls.reverse('GOnet-submit-form'), req, follow=True)
    enrich_job = GOnetSubmission.objects.latest('submit_time')
    df = enrich_job.enrich_res_df
    enriched_terms = df[df['q'] < enrich_job.qvalue]['term']

    # Step 2: annotation using the enriched terms as the custom term list.
    custom_annotation = '\n'.join(enriched_terms)
    req = dict(
        job_req, **{
            'paste_data': input_lines,
            'analysis_type': 'annot',
            'slim': 'custom',
            'custom_terms': custom_annotation
        })
    c.post(urls.reverse('GOnet-submit-form'), req, follow=True)
    annot_job = GOnetSubmission.objects.latest('submit_time')

    G_enrich = cyjs.cyjs2nx(json.loads(enrich_job.network))
    G_annot = cyjs.cyjs2nx(json.loads(annot_job.network))
    self.assertSetEqual(set(G_enrich.nodes), set(G_annot.nodes))
    self.assertSetEqual(set(G_enrich.edges), set(G_annot.edges))
def test_submission_default(self):
    # Manual stress test: posts the same enrichment request from 200 daemon
    # threads started at random intervals, then sleeps for a very long time
    # so the process can be inspected.
    with open(pkg_file(__name__, 'data/tests/genelist6.tsv'), 'r') as fh:
        input_lines = fh.read()

    request_data = {
        'submit': ['Submit'],
        'paste_data': [input_lines],
        'namespace': ['biological_process'],
        'analysis_type': ['enrich'],
        'output_type': ['graph'],
        'csv_separator': ['\t'],
        'qvalue': [0.05],
    }
    threads = []
    for _ in range(200):
        t = Thread(target=c.post,
                   args=(urls.reverse('GOnet-submit-form'), request_data))
        t.daemon = True
        t.start()
        threads.append(t)
        sleep(random.random() * 10)
        print('>!< Spamming task', len(threads))
    for t in threads:
        t.join()
    del threads
    print('sleeping......................................')
    sleep(100000000)
def test_cmp_vrt_iter_same(self):
    # A set compared with itself shares all of its variants.
    vs = VariantSetFromFile(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))
    total = len(list(vs.find_vrt()))
    shared = len(list(vs.comm_vrt(vs).iter_vrt()))
    self.assertEqual(shared, total)
def test_class(self):
    # Exercises find_vrt on a file-backed set with INFO parsing, a
    # reference and a single parsed sample.
    vset = VariantSetFromFile(pkg_file('genomvar.test',
                                       'data/example1.vcf.gz'),
                              parse_info=True,
                              reference=self.chr24,
                              parse_samples='SAMP1')

    def hits(*query):
        return list(vset.find_vrt(*query))

    # Region lookup and the INFO values returned with it.
    found = hits('chr24', 1200, 1210)
    self.assertEqual(len(found), 2)
    first, second = found
    self.assertEqual(first.attrib['info']['NSV'], 1)
    self.assertEqual(second.attrib['info']['RECN'], 19)

    # Multiallelic locus: both alleles report AF == 0.5.
    found = hits('chr24', 20, 30)
    self.assertEqual(len(found), 2)
    first, second = found
    self.assertEqual(first.attrib['info']['AF'], 0.5)
    self.assertEqual(second.attrib['info']['AF'], 0.5)

    # Precision of the interval boundaries.
    self.assertEqual(len(hits('chr24', 2095, 2096)), 1)
    self.assertEqual(len(hits('chr24', 2098, 2100)), 1)

    # Unrestricted query returns all variants.
    self.assertEqual(len(hits()), 16)
    # Same unrestricted query issued a second time, as in the original test.
    self.assertEqual(len(hits()), 16)
def test_from_variants_vcf(self):
    # Writes a VariantSet built from variants to VCF using two forms of
    # ``info_spec`` (tuples and the dtype dict of the source set) and checks
    # the header line and the variants read back.
    vs0 = varset.VariantSet.from_vcf(pkg_file('genomvar.test',
                                              'data/example1.vcf'),
                                     parse_info=True)
    variants1 = sorted(vs0.iter_vrt(), key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    _desc = 'Test for multinumber field'
    info_spec_tuples = [('DP4', 4, 'Integer', _desc), ('NSV', 1, 'Integer')]
    info_spec_dict = vs0.dtype['info']
    expected_header = \
        '##INFO=<ID=DP4,Number=4,Type=Integer,Description="{}">'\
        .format(_desc)

    for info_spec in (info_spec_tuples, info_spec_dict):
        # The context manager guarantees the temporary file is closed (and
        # removed) for every spec, even if an assertion fails.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                vs.to_vcf(fh, info_spec=info_spec)

            with open(tf.name, 'rt') as fh:
                self.assertIn(expected_header, fh.read().splitlines())

            variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_info=True),
                               key=lambda v: v.key)
            self.assertEqual(len(variants1), len(variants2))
            for v1, v2 in zip(variants1, variants2):
                self.assertTrue(v1.edit_equal(v2))
                self.assertEqual(v1.attrib['info']['NSV'],
                                 v2.attrib['info']['NSV'])
def test_GO_enrich_genelist6_long(self):
    # Enrichment of genelist6 against a custom background file with a
    # strict q-value threshold; only the HTTP status is asserted.
    with open(pkg_file(__name__, 'data/genelist6.tsv'), 'r') as fh:
        input_lines = fh.read()

    bg_path = pkg_file(__name__,
                       'data/DPOS_Mgate_Tcells_background_TPM1.lst')
    # Keep the background file open only for the duration of the upload.
    with open(bg_path, 'r') as bg_file:
        req = dict(
            job_req, **{
                'paste_data': input_lines,
                'bg_type': 'custom',
                'bg_file': bg_file,
                'qvalue': 0.0001
            })
        resp = c.post(urls.reverse('GOnet-submit-form'), req, follow=True)

    # Result unused; the lookup is kept from the original test --
    # presumably it raises if no submission was stored (TODO confirm).
    GOnetSubmission.objects.latest('submit_time')
    self.assertEqual(resp.status_code, 200)
def test_resolution_mouse_Uniprot_IDs(self):
    # Submits mouse UniProt IDs and checks that the ID-mapping export
    # resolves each of them to the MGI ID listed in genelist9.
    genelist9 = pd.read_csv(pkg_file(__name__, 'data/genelist9.tsv'),
                            sep='\t')
    genelist9 = genelist9[genelist9.Uniprot_ID != 'None']

    overrides = {
        'paste_data': '\n'.join(genelist9['Uniprot_ID']),
        'analysis_type': 'annot',
        'slim': 'goslim_immunol',
        'output_type': 'csv',
        'organism': 'mouse'
    }
    req = dict(job_req, **overrides)
    resp = c.post(urls.reverse('GOnet-submit-form'), req, follow=True)
    self.assertEqual(resp.status_code, 200)

    sn = GOnetSubmission.objects.latest('submit_time')
    idmap_resp = c.get(
        urls.reverse('GOnet-input-idmap', args=(str(sn.id), )))
    idmap_text = idmap_resp.content.decode()
    res = pd.read_csv(io.StringIO(idmap_text), sep='\t', index_col=0)

    for tup in genelist9.itertuples():
        # MGI:2151253 is skipped, as in the original test.
        if tup.MGI_ID != 'MGI:2151253':
            self.assertEqual(res.loc[tup.Uniprot_ID, 'MGI_ID'], tup.MGI_ID)
def test_from_variants_to_vcf_with_sampdata(self):
    # Writes variants with per-sample data (GT and AD) to VCF and checks
    # the FORMAT header line and the AD values read back.
    vcf_path = pkg_file('genomvar.test', 'data/example3.vcf')
    variants1 = sorted(VCFReader(vcf_path).iter_vrt(parse_samples=True),
                       key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    # The context manager guarantees the temporary file is closed (and
    # removed) even if an assertion fails.
    with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
        with open(tf.name, 'wt') as fh:
            vs.to_vcf(
                fh,
                format_spec=[RESERVED_FORMAT.GT,
                             ('AD', 'R', 'Integer', '')],
                samples=['SAMP1'])

        with open(tf.name, 'rt') as fh:
            self.assertIn(
                '##FORMAT=<ID=AD,Number=R,Type=Integer,'
                + 'Description="">',
                fh.read().splitlines())

        variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_samples=True),
                           key=lambda v: v.key)
        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['samples']['SAMP1']['AD'],
                             v2.attrib['samples']['SAMP1']['AD'])
def test_from_vcf_with_attr(self):
    # INFO attributes of variants loaded into memory with parse_info=True.
    vset = VariantSet.from_vcf(pkg_file('genomvar.test',
                                        'data/example1.vcf'),
                               parse_info=True)

    def non_null(chrom, start, end):
        # Matches in the interval, excluding variant.Null instances.
        return [item for item in vset.find_vrt(chrom, start, end)
                if not item.is_variant_instance(variant.Null)]

    found = list(vset.find_vrt('chr24', 150, 160))
    self.assertEqual(len(found), 1)
    self.assertEqual(found[0].attrib['info']['AF'], 1.0)

    # Check multiallelic locus
    found = list(vset.find_vrt('chr24', 20, 30))
    self.assertEqual(len(found), 2)
    for item in found:
        if item.is_variant_instance(variant.Null):
            continue
        self.assertEqual(item.attrib['info']['AF'], 0.5)

    # An INFO field that does not exist raises ValueError on access.
    found = non_null('chr24', 450, 460)
    self.assertEqual(len(found), 1)
    with self.assertRaises(ValueError):
        found[0].attrib['info']['Randomfields']

    # A flag-type INFO field reads back as True.
    found = non_null('chr24', 4750, 4760)
    self.assertEqual(len(found), 1)
    self.assertEqual(found[0].attrib['info']['STR'], True)
def test_from_vcf_to_records(self):
    # Field layout of the record arrays produced by ``to_records`` in
    # nested and flat form.
    vs = VariantSet.from_vcf(pkg_file('genomvar.test', 'data/example1.vcf'),
                             parse_info=True,
                             parse_samples=True)
    self.assertEqual(vs._samples, ['SAMP1'])

    core_fields = ['chrom', 'start', 'end', 'ref', 'alt', 'vartype',
                   'phase_group']
    info_fields = ['NSV', 'AF', 'DP4', 'ECNT', 'pl', 'mt', 'RECN', 'STR']

    # Test nested dtype
    nested = vs.to_records(nested=True)
    self.assertEqual(list(nested.dtype.fields),
                     core_fields + ['info', 'SAMPLES'])
    self.assertEqual(list(nested['info'].dtype.fields), info_fields)
    self.assertEqual(list(nested['SAMPLES'].dtype.fields), ['SAMP1'])
    self.assertEqual(list(nested['SAMPLES']['SAMP1'].dtype.fields), ['GT'])

    # Test not nested
    flat = vs.to_records(nested=False)
    expected_flat = (core_fields
                     + ['info_' + name for name in info_fields]
                     + ['SAMPLES_SAMP1_GT'])
    self.assertEqual(list(flat.dtype.fields), expected_flat)
def test_GO_annotate_invalid_term(self):
    # Appends an unknown GO term to the custom term list and expects the
    # response page to report it.
    with open(pkg_file(__name__, 'data/genelist2.tsv'), 'r') as fh:
        input_lines = fh.read()
    with open(pkg_file(__name__, 'data/custom_annotation.txt'), 'r') as fh:
        custom_annotation = fh.read()
    custom_annotation += 'GO:1234567'

    req = dict(
        job_req, **{
            'paste_data': input_lines,
            'analysis_type': 'annot',
            'slim': 'custom',
            'custom_terms': custom_annotation
        })
    resp = c.post(urls.reverse('GOnet-submit-form'), req, follow=True)
    self.assertContains(
        resp, 'Some of the custom terms provided were not found')
    self.assertContains(resp, 'GO:1234567')
def test_sv_types(self):
    # Loading a VCF with structural variants must emit several warnings,
    # the last of which mentions 'Structural'.
    with warnings.catch_warnings(record=True) as wrn:
        # Install the filter BEFORE the code under test runs; otherwise
        # which warnings get recorded depends on the ambient filters and
        # on per-module "already warned" registries.
        warnings.simplefilter('always')
        vs = VariantSet.from_vcf(
            pkg_file('genomvar.test', 'data/example4.vcf.gz'))
        self.assertEqual(vs.nof_unit_vrt(), 100)
        self.assertGreater(len(wrn), 1)
        self.assertIn('Structural', str(wrn[-1].message))
def test_sort_chroms(self):
    # Default chromosome ordering, then ordering through a custom key.
    vs = VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example2.vcf.gz'))

    vs.sort_chroms()
    self.assertEqual(list(vs.get_chroms()), ['chr23', 'chr24'])

    def chr24_first(chrom):
        # Rank chr24 ahead of every other chromosome.
        if chrom == 'chr24':
            return 1
        return 2

    vs.sort_chroms(key=chr24_first)
    self.assertEqual(list(vs.get_chroms()), ['chr24', 'chr23'])
def test_GO_annotate_genelist2(self):
    # Annotation of genelist2 against the goslim_immunol slim: checks the
    # network, the handling of user-supplied contrast values, and the CSV
    # and TXT exports.
    genelist_path = pkg_file(__name__, 'data/genelist2.tsv')
    with open(genelist_path, 'r') as fh:
        input_lines = fh.read()
    input_data_df = pd.read_csv(genelist_path, sep='\t', header=None)

    req = dict(
        job_req, **{
            'paste_data': input_lines,
            'analysis_type': 'annot',
            'slim': 'goslim_immunol'
        })
    resp = c.post(urls.reverse('GOnet-submit-form'), req, follow=True)
    self.assertEqual(resp.status_code, 200)

    sn = GOnetSubmission.objects.latest('submit_time')
    net = json.loads(sn.network)
    G = cyjs.cyjs2nx(net)
    self.assertTrue(G.has_edge('GO:0007165', 'P29376'))

    # Test recognition of user-supplied contrast values
    gene_nodes = [node for node in net['elements']['nodes']
                  if not node['data']['name'].startswith('GO:')]
    n_positive = sum(
        1 for node in gene_nodes
        if float(node['data']['expr:user_supplied']) > 0)
    self.assertEqual(n_positive,
                     np.sum(input_data_df[1] > 0) - 1)  # -1 for HIST1H2AM
    n_negative = sum(
        1 for node in gene_nodes
        if float(node['data']['expr:user_supplied']) < 0)
    self.assertEqual(n_negative, np.sum(input_data_df[1] < 0))

    # Test CSV response
    csv_resp = c.get(urls.reverse('GOnet-csv-res', args=(str(sn.id), )))
    res_df = pd.read_csv(io.StringIO(csv_resp.content.decode()),
                         sep=',', index_col=0)
    self.assertIn('GO:0007165', set(res_df['GO_term_ID']))
    self.assertEqual(res_df.index[0], 1)

    # Test TXT response
    txt_resp = c.get(urls.reverse('GOnet-txt-res', args=(str(sn.id), )))
    goterms = {line.split()[0]
               for line in io.StringIO(txt_resp.content.decode())}
    self.assertIn('GO:0007165', goterms)
def test_empty_vcf(self):
    # A VCF consisting only of its leading header lines yields no variants.
    header_only = io.StringIO()
    with open(pkg_file('genomvar.test', 'data/example1.vcf')) as fh:
        for line in fh:
            # Stop at the first non-header line.
            if not line.startswith('#'):
                break
            header_only.write(line)
    header_only.seek(0)

    vs = VariantSet.from_vcf(header_only)
    self.assertEqual(vs.nof_unit_vrt(), 0)