def test_multi_open2(self):
        """
        Open `vcf_file` with `ReadVCF.open`, then pass the resulting reader
        to `ReadVCF.open` again, and check that both passes stringify to
        the same list of datalines.
        """
        from idiva.io.vcf import ReadVCF

        with ReadVCF.open(vcf_file) as outer:
            assert isinstance(outer, ReadVCF)

            # First pass: consume the reader obtained from the file.
            first_pass = [str(dataline) for dataline in outer]

            # Second pass: re-open using the reader object itself.
            with ReadVCF.open(outer) as inner:
                second_pass = [str(dataline) for dataline in inner]

        self.assertListEqual(first_pass, second_pass)
Exemple #2
0
    def test_meta_accuracy(self):
        """
        Compare the `meta` attribute of a `ReadVCF` built on `vcf_file`
        with a hand-written reference dictionary.
        """
        from idiva.io.vcf import ReadVCF

        # Expected structured header entries, grouped by section.
        info = {
            "NS": dict(Number=1, Type="Integer", Description='"Number of Samples With Data"'),
            "DP": dict(Number=1, Type="Integer", Description='"Total Depth"'),
            "AF": dict(Number=None, Type="Float", Description='"Allele Frequency"'),
            "AA": dict(Number=1, Type="String", Description='"Ancestral Allele"'),
            "DB": dict(Number=0, Type="Flag", Description='"dbSNP membership, build 129"'),
            "H2": dict(Number=0, Type="Flag", Description='"HapMap2 membership"'),
        }
        filters = {
            "q10": dict(Description='"Quality below 10"'),
            "s50": dict(Description='"Less than 50% of samples have data"'),
        }
        formats = {
            "GT": dict(Number=1, Type="String", Description='"Genotype"'),
            "GQ": dict(Number=1, Type="Integer", Description='"Genotype Quality"'),
            "DP": dict(Number=1, Type="Integer", Description='"Read Depth"'),
            "HQ": dict(Number=2, Type="Integer", Description='"Haplotype Quality"'),
        }

        reference = {
            "INFO": info,
            "FILTER": filters,
            "FORMAT": formats,
            # Plain key=value header entries.
            "fileformat": "VCFv4.0",
            "fileDate": "20090805",
            "source": "myImputationProgramV3.1",
            "reference": "1000GenomesPilot-NCBI36",
            "phasing": "partial",
        }

        with open(vcf_file, mode='r') as fd:
            candidate = ReadVCF(fd).meta
            self.assertDictEqual(candidate, reference)
Exemple #3
0
 def test_open_read_vcf_meta(self):
     """
     Open the 'vcf_37' ClinVar file, check that the reader exposes no
     `sample_ids` attribute, and print its header.

     The method ends by raising NotImplementedError unconditionally,
     so it never completes successfully.
     """
     from idiva.db import clinvar_open
     from idiva.io import ReadVCF

     with clinvar_open(which='vcf_37') as stream:
         reader = ReadVCF(stream)
         has_sample_ids = hasattr(reader, "sample_ids")
         assert not has_sample_ids
         print(reader.header)

     # Placeholder: the test is marked as unfinished.
     raise NotImplementedError
Exemple #4
0
 def test_howto(self):
     """
     Usage walk-through: open the default ClinVar file, wrap it in
     `ReadVCF`, touch the `meta` attribute and iterate over all datalines.
     Nothing is asserted.
     """
     from idiva.db import clinvar_open
     from idiva.io.vcf import ReadVCF

     with clinvar_open() as stream:
         reader = ReadVCF(stream)

         # Header information is available as an attribute.
         reader.meta

         # The reader itself is iterable over datalines.
         for dataline in reader:
             pass
Exemple #5
0
 def samples_column(cls, fd):
     """
     Check that every sample entry of every dataline parses as a genotype.

     Each entry of `dataline.samples` is passed to `parse_gt`, and the
     result must unpack into exactly two values.

     :param fd: object handed to `ReadVCF` to iterate datalines from.
     :raises RuntimeError: naming the first genotype string for which
         `parse_gt` fails or does not yield exactly two values; the
         underlying exception is attached as the cause.
     """
     from idiva.io.vcf import parse_gt

     for dataline in ReadVCF(fd):
         for gt in dataline.samples:
             try:
                 # The unpacking is part of the check: exactly two values.
                 (__, __) = parse_gt(gt)
             except Exception as err:
                 # `Exception` rather than a bare `except`, so that
                 # KeyboardInterrupt / SystemExit are not masked.
                 raise RuntimeError(F"Could not parse genotype: {gt}") from err
    def test_dataline_types(self):
        """
        Take the first dataline of a reader opened via `ReadVCF.open`
        and check its type and the types of `pos`, `qual` and `info`.
        """
        from idiva.io.vcf import ReadVCF, RawDataline

        expected_types = (("pos", int), ("qual", float), ("info", str))

        with ReadVCF.open(vcf_file) as reader:
            candidate = first(reader)
            self.assertIsInstance(candidate, RawDataline)

            for (field, kind) in expected_types:
                self.assertIsInstance(getattr(candidate, field), kind)
Exemple #7
0
    def test_dataline_types(self):
        """
        Take the first dataline of a `ReadVCF` built on a plain text file
        handle and check its type and the types of `pos`, `qual`, `info`.
        """
        from idiva.io.vcf import ReadVCF, RawDataline

        with open(vcf_file, mode='r') as handle:
            reader = ReadVCF(handle)
            candidate = first(reader)

            # The record type itself.
            self.assertIsInstance(candidate, RawDataline)

            # Individual column types.
            pos, qual, info = candidate.pos, candidate.qual, candidate.info
            self.assertIsInstance(pos, int)
            self.assertIsInstance(qual, float)
            self.assertIsInstance(info, str)
Exemple #8
0
 def test_reads_all_lines(self):
     """
     For every entry of `PATHS`, compare the number of items produced by
     iterating a `ReadVCF` with the number of lines returned by
     `readlines()` on the underlying handle.
     """
     for key in PATHS:
         path = PATHS[key]
         with open_maybe_gz(path, mode='r') as handle:
             reader = ReadVCF(handle)
             from idiva.utils import seek_then_rewind

             # Count raw lines inside the seek/rewind context.
             with seek_then_rewind(handle, seek=None):
                 raw_lines = handle.readlines()
                 reference = len(raw_lines)

             # Count what the reader yields.
             with reader.rewind_when_done:
                 datalines = list(reader)
                 candidate = len(datalines)

             self.assertEqual(candidate, reference)
Exemple #9
0
 def test_count(self):
     """
     Download each file listed in `URLS`, read it through `ReadVCF`
     and compare the number of datalines with a hard-coded reference.
     """
     from idiva.io.vcf import ReadVCF, RawDataline

     # Reference counts (an earlier set was ctrl=2329288, case=2360972).
     expected_counts = {'ctrl': 2227080, 'case': 2258797}

     for group in URLS:
         downloaded = download(URLS[group]).now
         with downloaded.open(mode='rb') as raw, open_maybe_gz(raw, mode='r') as fd:
             assert isinstance(fd, io.TextIOBase)

             nlines = 0
             for __ in ReadVCF(fd):
                 nlines += 1

             self.assertEqual(nlines, expected_counts[group])
Exemple #10
0
 def test_clinvar_df(self):
     """
     Convert the 'vcf_37' ClinVar file with `clinvar_to_df` and check the
     number of rows, the 'CLNVC' column and the presence of 'OMIM_id'.
     """
     from idiva.db import clinvar_open
     from idiva.io import ReadVCF
     from idiva.db.clinvar import clinvar_to_df

     with clinvar_open(which='vcf_37') as stream:
         reader = ReadVCF(stream)
         df = clinvar_to_df(reader)

     self.assertEqual(len(df), REF_LENGTHS['clinvar_df'])

     # NOTE(review): if `df` is a pandas DataFrame, `all(...)` over the
     # selection iterates column labels, not rows, so this probably does
     # not check that every row is a single nucleotide variant -- confirm.
     snv_rows = df.loc[df['CLNVC'] == 'single_nucleotide_variant']
     self.assertTrue(all(snv_rows))

     # No missing values in the variant class column.
     clnvc_has_null = df['CLNVC'].isnull().values.any()
     self.assertFalse(clnvc_has_null)

     self.assertIn('OMIM_id', df.columns)
Exemple #11
0
    def test_length_clinvar(self):
        """
        Iterate over `datalines` of the 'vcf_37' ClinVar file and compare
        the last enumeration index with `REF_LENGTHS['clinvar_csv']`.
        """
        from idiva.db import clinvar_open
        from idiva.io import ReadVCF
        from tqdm import tqdm

        with clinvar_open(which='vcf_37') as stream:
            reader = ReadVCF(stream)
            progress = tqdm(enumerate(reader.datalines), postfix='reading clinvar file')
            for idx, line in progress:
                # Only the final value of `idx` is of interest.
                continue

        # `idx` is the zero-based index of the last dataline seen.
        self.assertEqual(idx, REF_LENGTHS['clinvar_csv'])
Exemple #12
0
 def test_datalines_accuracy(self):
     """
     Stringify every dataline read from `vcf_file` and compare the
     result with the expected tab-separated lines.
     """
     from idiva.io.vcf import ReadVCF

     # Expected string form of each dataline, in file order.
     reference = [
         "20	14370	rs6054257	G	A	29	PASS	NS=3;DP=14;AF=0.5;DB;H2	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.",
         "20	17330	None	T	A	3	q10	NS=3;DP=11;AF=0.017	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3",
         "20	1110696	rs6040355	A	G,T	67	PASS	NS=2;DP=10;AF=0.333,0.667;AA=T;DB	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4",
         "20	1230237	None	T	None	47	PASS	NS=3;DP=13;AA=T	GT:GQ:DP:HQ	0|0:54:7:56,60	0|0:48:4:51,51	0/0:61:2",
         "20	1234567	microsat1	GTCT	G,GTACT	50	PASS	NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/2:17:2	1/1:40:3",
     ]

     with open(vcf_file, mode='r') as handle:
         candidate = [str(dataline) for dataline in ReadVCF(handle)]
         self.assertListEqual(reference, candidate)
    def test_rewinds(self):
        """
        Read all datalines twice from the same reader, each time inside
        its `rewind_when_done` context, and check both reads agree.
        """
        from idiva.io.vcf import ReadVCF

        with ReadVCF.open(vcf_file) as reader:
            assert isinstance(reader, ReadVCF)

            passes = []
            for __ in range(2):
                with reader.rewind_when_done:
                    passes.append([str(dataline) for dataline in reader])

            (first_pass, second_pass) = passes
            self.assertListEqual(first_pass, second_pass)
Exemple #14
0
    def ref_alt_columns(cls, fd):
        """
        Check REF against ALT on every dataline: when REF is not a genomic
        string, REF and each comma-separated ALT entry must be one of the
        bracketed `<KEY>` names built from the keys of `meta['ALT']`.
        """
        reader = ReadVCF(fd)
        special = set(F"<{key}>" for key in reader.meta['ALT'].keys())

        for dataline in reader:
            ref = dataline.ref
            alts = dataline.alt.split(',')

            if is_genomic_string(ref):
                # No assumption is made about ALT here: it cannot be
                # assumed that every ALT entry is a genomic string too.
                continue

            assert ref in special
            assert all((alt in special) for alt in alts)
Exemple #15
0
 def maker():
     """
     Build a dataframe with one row per 'rs'-style ID from the ClinVar
     file selected by `which`, with the distinct CLNSIG values of each ID
     sorted, joined by ", " and wrapped in double quotes.
     """
     from idiva.io.vcf import ReadVCF

     def quoted_unique(values):
         # Distinct values as strings, sorted, comma-joined, in quotes.
         distinct = sorted(set(map(str, values)))
         return F'"{", ".join(distinct)}"'

     with clinvar_open(which) as stream:
         records = clinvar_datalines(ReadVCF(stream))
         df = pd.DataFrame(data=(records))

         # Keep the two columns of interest under their new names.
         new_names = {'RS': "ID", 'CLNSIG': "ClnSig"}
         df = df[["RS", "CLNSIG"]].rename(columns=new_names)

         # Keep only rows whose ID has the form rs<digits>.
         is_rs_id = df.ID.fillna('').str.contains(r"^rs[0-9]+$")
         df = df[is_rs_id]

         grouped = df.groupby('ID', as_index=False)
         df = grouped.agg({'ClnSig': quoted_unique})
         return df
Exemple #16
0
    def alt_column(cls, fd):
        """
        Check that, on every dataline, at least one comma-separated ALT
        entry is a single nucleotide, a string over the letters T/C/G/A,
        or a bracketed `<KEY>` name built from the keys of `meta['ALT']`.

        Prints the offending ALT and REF and raises RuntimeError otherwise.
        """
        reader = ReadVCF(fd)

        TCGA = {"T", "C", "G", "A"}
        special = set(F"<{key}>" for key in reader.meta['ALT'].keys())

        def fits_known_format(alt):
            # single nt / multi nt / special
            return (alt in TCGA) or set(alt).issubset(TCGA) or (alt in special)

        for dataline in reader:
            entries = dataline.alt.split(',')

            if any(map(fits_known_format, entries)):
                continue

            print(F"ALT = '{dataline.alt}' does not fit any known format.")
            print(F"REF = '{dataline.ref}'.")
            raise RuntimeError("Assumption on ALT column failed.")
Exemple #17
0
    def ref_column(cls, fd):
        """
        Check that REF on every dataline contains no comma and is a single
        nucleotide, a string over the letters T/C/G/A, or a bracketed
        `<KEY>` name built from the keys of `meta['ALT']`.

        Prints the offending REF and ALT and raises RuntimeError otherwise.
        """
        reader = ReadVCF(fd)

        TCGA = {"T", "C", "G", "A"}
        special = set(F"<{key}>" for key in reader.meta['ALT'].keys())

        for dataline in reader:
            assert "," not in dataline.ref

            is_single_nt = dataline.ref in TCGA
            is_multi_nt = set(dataline.ref).issubset(TCGA)
            is_special = (dataline.ref in special)

            if is_single_nt or is_multi_nt or is_special:
                continue

            print(F"REF = '{dataline.ref}' does not fit any known format.")
            print(F"ALT = '{dataline.alt}'.")
            raise RuntimeError("Assumption on REF column failed.")
Exemple #18
0
def create_dbSNP_df(dbSNP_file_path: Path,
                    out_base: Path,
                    which_chrom: typing.Union[int, str] = 17) -> None:
    """
    Converts the dbSNP vcf file to a dataframe and writes it to
    `out_base / 'GRCh37_latest_dbSNP_all_chrom{which_chrom}.csv.gz'`
    as a gzip-compressed CSV without the index column.

    :param dbSNP_file_path: path of the dbSNP VCF file, opened in text mode.
    :param out_base: existing directory that receives the output file.
    :param which_chrom: chromosome selector; the value '_all' is passed to
        `dbSNP_to_df` as the prefix 'NC', any other value as 'NC_' followed
        by the value zero-padded to six characters.
    :raises AssertionError: if `out_base` does not exist.
    """
    out_path = out_base / f'GRCh37_latest_dbSNP_all_chrom{which_chrom}.csv.gz'

    # The message previously contained the unevaluated path expression.
    log.info(f"Converting {dbSNP_file_path} to {out_path}")
    print(out_path)
    assert out_base.exists(), f"Output folder does not exist: {out_base}"

    if which_chrom == '_all':
        chrom_selector = 'NC'
    else:
        chrom_selector = f'NC_{str(which_chrom).zfill(6)}'

    with open(dbSNP_file_path, mode='r') as fd:
        df = dbSNP_to_df(ReadVCF(fd), which_chrom=chrom_selector)

    df.to_csv(out_path, index=False, compression="gzip")

    # The (empty) file is still written; the warning only flags it.
    if not len(df):
        log.warning(f'created dataframe is empty for chrom {which_chrom}')
Exemple #19
0
    def test_open_read_vcf_datalines(self):
        """
        Read the first datalines of the 'vcf_37' ClinVar file and compare
        their string form and their `ref` attribute with expected values.
        """
        from idiva.db import clinvar_open
        from idiva.io import ReadVCF
        from idiva.io.vcf import RawDataline

        # Expected string form of the leading datalines.
        reference = [
            "1	865568	846933	G	A	.	.	ALLELEID=824438;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865568G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1",
            "1	865583	972363	C	T	.	.	ALLELEID=959431;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865583C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1",
            "1	865628	789256	G	A	.	.	AF_ESP=0.00347;AF_EXAC=0.00622;AF_TGP=0.00280;ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865628G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=41285790",
        ]
        expected_refs = ['G', 'C', 'G']

        with clinvar_open(which='vcf_37') as stream:
            reader = ReadVCF(stream)

            datalines: typing.List[RawDataline]
            datalines = list(at_most_n(reader, n=len(reference)))

            self.assertIsInstance(datalines[0], RawDataline)

            candidate = [str(dataline) for dataline in datalines]
            self.assertListEqual(reference, candidate)

            for (position, expected_ref) in enumerate(expected_refs):
                self.assertEqual(datalines[position].ref, expected_ref)
 def test_db_clf(self):
     """
     Run `phenomenet_classifier` on the 'case' VCF only and check that
     the resulting dataframe is non-empty.
     """
     case_url = URLS['case']
     with ReadVCF.open(case_url) as case:
         result = phenomenet_classifier(case=case)

     n_rows = len(result.df)
     self.assertTrue(n_rows)
 def test_db_clf(self):
     """
     Run `db_classifier` on the 'case' VCF with no control group and
     check that the resulting dataframe is non-empty.
     """
     case_url = URLS['case']
     with ReadVCF.open(case_url) as case:
         result = db_classifier(case=case, ctrl=None)

     n_rows = len(result.df)
     self.assertTrue(n_rows)
 def test_sanity2(self):
     """
     Smoke test: open `vcf_file`, re-open the resulting reader through
     `ReadVCF.open`, and consume all datalines of the inner reader.
     """
     from idiva.io.vcf import ReadVCF

     with ReadVCF.open(vcf_file) as outer:
         with ReadVCF.open(outer) as inner:
             # Only exhausting the reader matters; the result is dropped.
             list(inner)
Exemple #23
0
 def test_sanity(self):
     """
     Smoke test: constructing a `ReadVCF` on an open text handle of
     `vcf_file` must not raise.
     """
     from idiva.io.vcf import ReadVCF

     with open(vcf_file, mode='r') as handle:
         # The instance is intentionally discarded.
         ReadVCF(handle)
Exemple #24
0
 def id_is_unique(cls, fd):
     """
     Assert that the `id` values of all datalines read from `fd`
     are pairwise distinct.
     """
     import pandas as pd

     # Materialize all IDs first, then let pandas test for duplicates.
     id_series = pd.Series([record.id for record in ReadVCF(fd)])
     assert id_series.is_unique
Exemple #25
0
 def test_read_case(self):
     with PATHS['case'].open(mode='r') as fd:
         candidate = str(list(at_most_n(ReadVCF(fd), 10)).pop())
         reference = "17	186	rs547289895	G	A	100	PASS	AC=1;AF=0.000199681;AN=5008;NS=2504;DP=18075;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP	GT	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0	0|0"
         self.assertEqual(reference, candidate)
Exemple #26
0
 def test_phenom_basic(self):
     """
     Run `phenomenet_classifier_basic` on the 'case' VCF with no control
     group, check that the resulting dataframe is non-empty, and log it.
     """
     case_url = URLS['case']
     with ReadVCF.open(case_url) as case:
         result = phenomenet_classifier_basic(case=case, ctrl=None)

     n_rows = len(result.df)
     self.assertTrue(n_rows)
     log.info('passed!')
Exemple #27
0
 def test_phenom_clf(self):
     """
     Run `phenomenet_classifier` on the 'case' and 'ctrl' VCFs, check
     that the resulting dataframe is non-empty, and log success.
     """
     with ReadVCF.open(URLS['case']) as case:
         with ReadVCF.open(URLS['ctrl']) as ctrl:
             result = phenomenet_classifier(case=case, ctrl=ctrl)

     n_rows = len(result.df)
     self.assertTrue(n_rows)
     log.info('passed!')
Exemple #28
0
 def format_is_gt(cls, fd):
     """
     Assert that the `format` attribute of every dataline read from
     `fd` equals "GT".
     """
     expected_format = "GT"
     for record in ReadVCF(fd):
         assert record.format == expected_format