Example #1
0
def process(*, case_vcf: Path, ctrl_vcf: Path, out_dir: Path):
    """
    Run `process_vcf` on a case/control VCF pair and pass each result to `post`.

    Both files are opened with `open_maybe_gz` and must come back as text
    streams. Two passes are made while they are open:

    1. over what `head(...)` yields for each file, with output location
       `out_dir / "head"`;
    2. over the full file handles, with output location `out_dir / "full"`.

    NOTE(review): the second pass reuses the handles that were given to
    `head(...)` -- presumably `head` leaves them positioned at the start;
    confirm against `idiva.io.head`.
    """
    from idiva.io import head, open_maybe_gz

    rule = "======================"

    with open_maybe_gz(case_vcf) as case_full:
        with open_maybe_gz(ctrl_vcf) as ctrl_full:
            # Everything downstream expects text-mode streams.
            assert isinstance(case_full, io.TextIOBase)
            assert isinstance(ctrl_full, io.TextIOBase)

            # First pass: only the heads of both files.
            with head(case_full) as case_head:
                with head(ctrl_full) as ctrl_head:
                    for line in (rule, "Processing VCF (HEAD).", rule):
                        log.info(line)
                    head_result = process_vcf(
                        case=ReadVCF(case_head),
                        ctrl=ReadVCF(ctrl_head),
                        out=(out_dir / "head"),
                    )
                    post(head_result)

            # Second pass: the complete files.
            for line in (rule, "Processing VCF (FULL).", rule):
                log.info(line)
            full_result = process_vcf(
                case=ReadVCF(case_full),
                ctrl=ReadVCF(ctrl_full),
                out=(out_dir / "full"),
            )
            post(full_result)
    def test_fisher_large_head(self):
        """
        Run `vcf_to_fisher` on the case/ctrl pair from `PATHS_LARGE_HEAD`
        and print the resulting `df`. No assertion is made on its content.
        """
        from idiva.io import ReadVCF, open_maybe_gz

        with open_maybe_gz(PATHS_LARGE_HEAD['case'], mode='r') as fd_case, \
                open_maybe_gz(PATHS_LARGE_HEAD['ctrl'], mode='r') as fd_ctrl:
            from idiva.stat.vcf_to_fisher import vcf_to_fisher
            fisher = vcf_to_fisher(case=ReadVCF(fd_case), ctrl=ReadVCF(fd_ctrl))
            df = fisher.df

        from idiva.utils.testing import whatsmyname
        # Computed but not used further in this test.
        out_dir = MY_SPACE / whatsmyname()
        print(df)
 def test_check_all(self):
     """
     For every entry of `URLS`, open the downloaded file in binary mode,
     pass it through `open_maybe_gz`, and print each item that
     `check_all` produces next to the group name.
     """
     for group in URLS:
         with download(URLS[group]).now.open(mode='rb') as raw_fd:
             with open_maybe_gz(raw_fd) as text_fd:
                 from idiva.io.ass import check_all
                 for outcome in check_all(text_fd):
                     print(group, outcome)
    def test_md5_full(self):
        """
        Access `ReadVCF(...).md5` for every file in `PATHS_LARGE_FULL`.

        The value is not compared to anything; the test only checks that
        the attribute can be obtained without an error.
        """
        from idiva.io import ReadVCF, open_maybe_gz

        for path in PATHS_LARGE_FULL.values():
            with open_maybe_gz(path, mode='r') as stream:
                assert isinstance(stream, io.TextIOBase)
                _ = ReadVCF(stream).md5
    def test_poc_head(self):
        """
        Proof-of-concept round trip.

        For every file in `PATHS_LARGE_HEAD`: preload it through `ReadVCF`,
        print its meta lines, header line and datalines into a `.log` file
        under `MY_SPACE`, then compare that file with the original line by
        line. Lines of the original that start with `##fileDate` or
        `##source` are exempt from the comparison.
        """
        from idiva.io import ReadVCF, open_maybe_gz

        for (k, ref_file) in PATHS_LARGE_HEAD.items():
            # The output file is named after this test method and the key.
            test_name = sys._getframe().f_code.co_name
            can_file = (MY_SPACE / F"{test_name}__{k}").with_suffix(".log")

            with open_maybe_gz(ref_file, mode='r') as fd_ref:
                assert isinstance(fd_ref, io.TextIOBase)
                vcf = ReadVCF(fd_ref).preload_all()

            # Everything printed below lands in the candidate file.
            with open(can_file, mode='w') as fd_can, redirect_stdout(fd_can):
                for (meta_key, meta_val) in vcf.meta.items():
                    if isinstance(meta_val, str):
                        lowered = str(meta_key).lower()
                        # These two values are replaced by the ones from `idiva.io.out`.
                        if lowered == "filedate":
                            from idiva.io.out import fileDate
                            meta_val = fileDate
                        if lowered == "source":
                            from idiva.io.out import source
                            meta_val = source
                        print(F"##{meta_key}={meta_val}")
                    elif isinstance(meta_val, dict):
                        for (entry_id, fields) in meta_val.items():
                            assert isinstance(fields, dict)
                            assert fields
                            # A `None` field value is written as '.'.
                            rendered = ','.join(
                                F"{name}={'.' if value is None else value}"
                                for (name, value) in fields.items()
                            )
                            print(F"##{meta_key}=<ID={entry_id},{rendered}>")

                print(F"#{SEP.join(vcf.header)}")

                for dataline in vcf:
                    print(str(dataline))

            from idiva.io import Oneliner
            with open_maybe_gz(ref_file, mode='r') as fd_ref, \
                    open_maybe_gz(can_file, mode='r') as fd_can:
                assert isinstance(fd_ref, io.TextIOBase)
                assert isinstance(fd_can, io.TextIOBase)
                lines_ref = list(Oneliner(fd_ref))
                lines_can = list(Oneliner(fd_can))
                for (ref, can) in zip(lines_ref, lines_can):
                    if ref.startswith(("##fileDate", "##source")):
                        continue
                    self.assertEqual(can, ref)
                # `zip` stops at the shorter list, so lengths are checked separately.
                self.assertEqual(len(lines_can), len(lines_ref))
Example #6
0
 def open(cls, file):
     """
     Yield a `cls` object for `file`.

     If `file` is already an instance of `cls`, a warning is logged and
     `file` itself is yielded. Otherwise `file` is opened with
     `open_maybe_gz` and the handle is wrapped in `cls`; the handle is
     held open while the generator is suspended at the yield.

     NOTE(review): a generator taking `cls` -- presumably wrapped as a
     classmethod / context manager outside the visible snippet; confirm.
     """
     from idiva.io import open_maybe_gz

     if not isinstance(file, cls):
         with open_maybe_gz(file) as fd:
             yield cls(fd)
         return

     # This shouldn't normally happen
     log.warning(F"`file` is already a {cls.__name__}.")
     yield file
Example #7
0
 def test_reads_all_lines(self):
     """
     For every file in `PATHS`, the number of items obtained by iterating
     a `ReadVCF` must equal the number of lines `readlines()` returns on
     the underlying handle after the `ReadVCF` has been constructed.
     """
     from idiva.utils import seek_then_rewind

     for key in PATHS:
         with open_maybe_gz(PATHS[key], mode='r') as fd:
             vcf = ReadVCF(fd)

             # Count the lines left on the raw handle.
             with seek_then_rewind(fd, seek=None):
                 raw_lines = fd.readlines()
             reference = len(raw_lines)

             # Count what the VCF reader yields.
             with vcf.rewind_when_done:
                 datalines = list(vcf)
             candidate = len(datalines)

             self.assertEqual(candidate, reference)
 def test_makes_df_ctrl(self):
     """
     Build a dataframe from the 'ctrl' download with `v0_df` and check
     that it is non-empty and has the length recorded in `ref_len_v2`.
     """
     # This takes about 20min on v1/v2
     from idiva.clf.df import v0_df
     from idiva.io import ReadVCF

     with download(URLS['ctrl']).now.open(mode='rb') as raw_fd:
         with open_maybe_gz(raw_fd) as text_fd:
             assert isinstance(text_fd, io.TextIOBase)
             df = v0_df(ReadVCF(text_fd))
             n_rows = len(df)
             self.assertTrue(n_rows > 0)
             self.assertEqual(n_rows, ref_len_v2['ctrl'])
    def test_combines(self):
        """
        Build one `v0_df` dataframe per entry of `URLS` and `join` the
        'case' and 'ctrl' ones. The joined result is not inspected.
        """
        from idiva.io import ReadVCF
        from idiva.clf.df import v0_df, join

        frames = {}
        for group in URLS:
            with download(URLS[group]).now.open(mode='rb') as raw_fd:
                with open_maybe_gz(raw_fd) as text_fd:
                    assert isinstance(text_fd, io.TextIOBase)
                    frames[group] = v0_df(ReadVCF(text_fd))

        df = join(case=frames['case'], ctrl=frames['ctrl'])
    def test_md5_head(self):
        """
        For every file in `PATHS_LARGE_HEAD`, `ReadVCF(...).md5` must equal
        the MD5 hex digest of the file's text, read from position 0 and
        encoded with `str.encode()` defaults.
        """
        import hashlib

        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.utils import seek_then_rewind

        for path in PATHS_LARGE_HEAD.values():
            with open_maybe_gz(path, mode='r') as stream:
                assert isinstance(stream, io.TextIOBase)

                # Reference digest computed directly from the text.
                with seek_then_rewind(stream, seek=0):
                    text = stream.read()
                    reference = hashlib.md5(text.encode()).hexdigest()

                candidate = ReadVCF(stream).md5

                self.assertEqual(candidate, reference)
    def test_chi2_large_head(self):
        """
        Build `v0_df` dataframes from `PATHS_LARGE_HEAD`, join 'case' with
        'ctrl', and call `chi2_test` on the ALT0..ALT2 columns of both
        groups. The return value is not checked.
        """
        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.clf.df import v0_df, join

        frames = {}
        for (group, path) in PATHS_LARGE_HEAD.items():
            with open_maybe_gz(path, mode='r') as stream:
                frames[group] = v0_df(ReadVCF(stream))

        df = join(case=frames['case'], ctrl=frames['ctrl'])

        # One list of column names per group: case first, then ctrl.
        per_kind = []
        for kind in ['case', 'ctrl']:
            per_kind.append([F"ALT{n}_{kind}" for n in range(3)])
        cols = tuple(per_kind)

        selected = cols[0] + cols[1]
        chi2_test(df[selected], cols, add=1)
    def test_chi2_large(self):
        """
        Build `v0_df` dataframes from the `URLS_LARGE` downloads, join
        'case' with 'ctrl', and call `chi2_test` on the ALT0..ALT2 columns
        of both groups. The return value is not checked.
        """
        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.clf.df import v0_df, join

        frames = {}
        for group in URLS_LARGE:
            with download(URLS_LARGE[group]).now.open(mode='rb') as raw_fd:
                with open_maybe_gz(raw_fd, mode='r') as text_fd:
                    frames[group] = v0_df(ReadVCF(text_fd))

        df = join(case=frames['case'], ctrl=frames['ctrl'])

        # One list of column names per group: case first, then ctrl.
        per_kind = []
        for kind in ['case', 'ctrl']:
            per_kind.append([F"ALT{n}_{kind}" for n in range(3)])
        cols = tuple(per_kind)

        selected = cols[0] + cols[1]
        chi2_test(df[selected], cols, add=1)
Example #13
0
    def open(ReadVCF, file):
        """
        Yield a `ReadVCF` for `file`.

        If `file` is already a `ReadVCF`, a warning is logged, its file
        descriptor is moved to `dataline_start_pos`, and the same object
        is yielded inside its `rewind_when_done` context, so the cursor
        is put back after use.

        Otherwise `file` is opened with `open_maybe_gz` and wrapped in a
        new `ReadVCF`; the cursor is not put back in that case.
        """
        if not isinstance(file, ReadVCF):
            from idiva.io import open_maybe_gz
            with open_maybe_gz(file) as fd:
                yield ReadVCF(fd)
            return

        log.warning(F"Reopening VCF: {file.fd}.")
        with file.rewind_when_done:
            file.fd.seek(file.dataline_start_pos)
            yield file
Example #14
0
 def test_plain(self):
     """
     Reading `ref_file` through `open_maybe_gz` in mode 'r' must give
     exactly `reference`.
     """
     with open_maybe_gz(ref_file, mode='r') as stream:
         candidate = stream.read()
         self.assertEqual(reference, candidate)
Example #15
0
    def test_gz(self):
        """
        Reading `ref_file_gz` through `open_maybe_gz` in mode 'r' must
        give a `str` equal to `reference`.
        """
        with open_maybe_gz(ref_file_gz, mode='r') as stream:
            text = stream.read()
            self.assertIsInstance(text, str)

        self.assertEqual(reference, text)
Example #16
0
 def test_dont_open_r_as_rb(self):
     """
     Passing a handle opened in mode 'r' to `open_maybe_gz` with
     mode='rb' must raise `AssertionError`.
     """
     with open(ref_file, mode='r') as text_fd, self.assertRaises(AssertionError):
         with open_maybe_gz(text_fd, mode='rb'):
             pass
Example #17
0
 def test_open_from_rb_as_rb(self):
     """
     A handle opened in mode 'rb' and passed through `open_maybe_gz`
     with mode='rb' must, once wrapped in `io.TextIOWrapper`, read back
     as `reference`.
     """
     with open(ref_file, mode='rb') as raw_fd:
         with open_maybe_gz(raw_fd, mode='rb') as binary_fd:
             with io.TextIOWrapper(binary_fd) as text_fd:
                 content = text_fd.read()
                 self.assertEqual(reference, content)