def maker_clinvar() -> pandas.DataFrame:
    """Build the ClinVar dataframe from the ClinVar VCF.

    The resource to open is selected by ``which``, a name this function
    does not define itself; it is resolved from the surrounding scope.

    Returns:
        The dataframe produced by ``clinvar_to_df`` from the parsed VCF.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from idiva.db.clinvar import clinvar_to_df

    with clinvar_open(which=which) as stream:
        # Parse while the stream is still open; the reader consumes it.
        reader = ReadVCF(stream)
        frame = clinvar_to_df(reader)
        return frame
def test_open_read_vcf_meta(self):
    """Inspect the header of the 'vcf_37' ClinVar VCF.

    Asserts that the reader exposes no ``sample_ids`` attribute, prints
    the header, and then raises ``NotImplementedError``.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF

    with clinvar_open(which='vcf_37') as stream:
        reader = ReadVCF(stream)

        has_samples = hasattr(reader, "sample_ids")
        assert not has_samples

        print(reader.header)

        # NOTE(review): presumably marks this test as unfinished; it always
        # fails at this point -- confirm whether that is still intended.
        raise NotImplementedError
def test_howto(self):
    """Usage walkthrough: open ClinVar, read the meta info, visit each line.

    Makes no assertions; it only exercises opening the default ClinVar
    resource, accessing ``meta`` on the reader, and iterating the reader
    to exhaustion.
    """
    from idiva.db import clinvar_open
    from idiva.io.vcf import ReadVCF

    with clinvar_open() as stream:
        reader = ReadVCF(stream)

        # Touch the meta information (accessed for its own sake).
        reader.meta

        # Walk over every data line of the file.
        for record in reader:
            record
def test_open_read_manual(self):
    """Read the first three lines of the ClinVar file by hand.

    Checks that ``clinvar_open()`` yields a text stream and that its
    first three stripped lines equal the expected header lines.
    """
    from idiva.db import clinvar_open

    expected_head = [
        '##fileformat=VCFv4.1',
        '##fileDate=2020-11-07',
        '##source=ClinVar',
    ]

    with clinvar_open() as stream:
        self.assertIsInstance(stream, io.TextIOBase)

        # Pull exactly three lines off the stream, one readline() at a time.
        observed_head = []
        for __ in range(3):
            raw_line = stream.readline()
            observed_head.append(raw_line.strip())

        self.assertListEqual(expected_head, observed_head)
def maker_clinvar() -> pd.DataFrame:
    """
    Create the ClinVar dataframe.

    Logs an info message, opens the ClinVar resource selected by
    ``clinvar_file`` (a name taken from the surrounding scope), parses it
    with ``ReadVCF`` and converts the result with ``clinvar_to_df``.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from idiva.db.clinvar import clinvar_to_df

    log.info('Making clinvar df.')

    with clinvar_open(which=clinvar_file) as stream:
        # The conversion happens while the stream is still open.
        parsed_vcf = ReadVCF(stream)
        clinvar_frame = clinvar_to_df(parsed_vcf)
        return clinvar_frame
def test_clinvar_df(self):
    """Check size, variant class and columns of the ClinVar dataframe.

    Builds the dataframe from the 'vcf_37' ClinVar resource and verifies:

    * its row count equals ``REF_LENGTHS['clinvar_df']``;
    * every ``CLNVC`` value is ``'single_nucleotide_variant'``;
    * ``CLNVC`` contains no null values;
    * an ``OMIM_id`` column is present.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from idiva.db.clinvar import clinvar_to_df

    with clinvar_open(which='vcf_37') as fd:
        df = clinvar_to_df(ReadVCF(fd))

        self.assertEqual(len(df), REF_LENGTHS['clinvar_df'])

        # Compare the column element-wise and reduce with Series.all().
        # The builtin all() applied to a DataFrame only iterates its column
        # labels (non-empty strings), so that form could never fail.
        is_snv = df['CLNVC'] == 'single_nucleotide_variant'
        self.assertTrue(is_snv.all())

        self.assertFalse(df['CLNVC'].isnull().values.any())

        # assertIn reports the container on failure, unlike assertTrue(x in y).
        self.assertIn('OMIM_id', df.columns)
def test_length_clinvar(self):
    """Check the number of data lines in the 'vcf_37' ClinVar VCF.

    Iterates over ``vcf.datalines`` (with a tqdm progress display) and
    compares the index of the last data line against
    ``REF_LENGTHS['clinvar_csv']``.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from tqdm import tqdm

    # Start below any valid index so that a file without data lines fails
    # the assertion below instead of raising on an unbound loop variable.
    idx = -1

    with clinvar_open(which='vcf_37') as fd:
        vcf = ReadVCF(fd)
        for idx, _line in tqdm(enumerate(vcf.datalines), postfix='reading clinvar file'):
            pass

    # NOTE(review): idx is the zero-based index of the last line, i.e. the
    # line count minus one -- confirm the reference value is stored that way.
    self.assertEqual(idx, REF_LENGTHS['clinvar_csv'])
def test_open_read_vcf_datalines(self):
    """Compare the first data lines of the 'vcf_37' ClinVar VCF to a reference.

    Takes as many data lines from the reader as there are reference
    strings, checks that they are ``RawDataline`` objects, that their
    string form equals the reference, and that their ``ref`` attributes
    are 'G', 'C' and 'G' respectively.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF

    with clinvar_open(which='vcf_37') as stream:
        reader = ReadVCF(stream)

        reference = [
            "1 865568 846933 G A . . ALLELEID=824438;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865568G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1",
            "1 865583 972363 C T . . ALLELEID=959431;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865583C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1",
            "1 865628 789256 G A . . AF_ESP=0.00347;AF_EXAC=0.00622;AF_TGP=0.00280;ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865628G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=41285790",
        ]

        from idiva.io.vcf import RawDataline

        # Take only as many data lines as there are reference strings.
        wanted = len(reference)
        datalines: typing.List[RawDataline] = list(at_most_n(reader, n=wanted))

        self.assertIsInstance(datalines[0], RawDataline)

        # The string form of each data line must reproduce the reference.
        candidate = [str(record) for record in datalines]
        self.assertListEqual(reference, candidate)

        # Reference alleles of the three lines, checked in order.
        first, second, third = datalines[0], datalines[1], datalines[2]
        self.assertEqual(first.ref, 'G')
        self.assertEqual(second.ref, 'C')
        self.assertEqual(third.ref, 'G')
def test_open(self):
    """Check that ``clinvar_open()`` hands back a text stream."""
    from idiva.db import clinvar_open

    with clinvar_open() as stream:
        expected_type = io.TextIOBase
        self.assertIsInstance(stream, expected_type)