def test_makes_df_ctrl(self):
     """
     Build the v0 dataframe from the 'ctrl' VCF and check its length.

     The resource behind `URLS['ctrl']` is opened in binary mode and passed
     through `open_maybe_gz`, which must hand back a text stream. The
     dataframe built from it must be non-empty and its length must equal
     `ref_len_v2['ctrl']`.
     """
     # This takes about 20min on v1/v2
     from idiva.clf.df import v0_df
     from idiva.io import ReadVCF

     # Distinct names for the raw and the decoded handle (no shadowing)
     with download(URLS['ctrl']).now.open(mode='rb') as raw:
         with open_maybe_gz(raw) as fd:
             # A unittest assertion is not stripped under `python -O`
             self.assertIsInstance(fd, io.TextIOBase)
             df = v0_df(ReadVCF(fd))
             # assertGreater reports the actual length on failure
             self.assertGreater(len(df), 0)
             self.assertEqual(len(df), ref_len_v2['ctrl'])
    def test_combines(self):
        """
        Smoke test: join the dataframes built from every entry of `URLS`.

        Each resource is opened in binary mode, passed through
        `open_maybe_gz` (which must yield a text stream) and turned into a
        dataframe. The 'case' and 'ctrl' frames are then joined. Nothing is
        asserted about the joined result; the test passes if nothing raises.
        """
        from idiva.io import ReadVCF
        from idiva.clf.df import v0_df, join

        def load(url):
            # Binary download handle -> text stream -> dataframe
            with download(url).now.open(mode='rb') as raw, open_maybe_gz(raw) as fd:
                assert isinstance(fd, io.TextIOBase)
                return v0_df(ReadVCF(fd))

        frames = {k: load(URLS[k]) for k in URLS}

        # The joined frame is intentionally not inspected
        join(case=frames['case'], ctrl=frames['ctrl'])
    def test_join_does_something(self):
        """
        Smoke test: `join` runs on the dataframes built from the `PATHS` files.

        Every file in `PATHS` is opened in text mode and parsed into a
        dataframe; the 'case' and 'ctrl' frames are then joined. Nothing is
        asserted about the joined result; the test passes if nothing raises.
        """
        from idiva.io import ReadVCF
        from idiva.clf.df import v0_df, join

        def read_frame(path):
            # Parse one VCF file into a dataframe
            with path.open(mode='r') as fd:
                assert isinstance(fd, io.TextIOBase)
                return v0_df(ReadVCF(fd))

        frames = {k: read_frame(PATHS[k]) for k in PATHS}

        # The joined frame is intentionally not inspected
        join(case=frames['case'], ctrl=frames['ctrl'])
    def test_makes_df(self):
        """
        Check that `v0_df` has as many entries as `ReadVCF` iterates over.

        Each file in `PATHS` is read twice from the same handle, each pass
        wrapped in `seek_then_rewind`: once to count the items produced by
        `ReadVCF`, once to build the dataframe. Both lengths must agree.
        """
        from idiva.clf.df import v0_df
        from idiva.io import ReadVCF
        from idiva.utils import seek_then_rewind

        for k in PATHS:
            with PATHS[k].open(mode='r') as fd:
                # A unittest assertion is not stripped under `python -O`
                self.assertIsInstance(fd, io.TextIOBase)
                with seek_then_rewind(fd):
                    # Only the count is needed, so do not keep the items
                    n_datalines = sum(1 for _ in ReadVCF(fd))
                with seek_then_rewind(fd):
                    df = v0_df(ReadVCF(fd))
                # Name the offending file on failure
                self.assertEqual(n_datalines, len(df), msg=f"file key: {k!r}")
    def test_chi2_large_head(self):
        """
        Smoke test: run `chi2_test` on the joined `PATHS_LARGE_HEAD` frames.

        The files are opened with `open_maybe_gz` in text mode and parsed
        into dataframes, which are joined as case/ctrl. `chi2_test` is then
        called on the ALT0..ALT2 columns of both groups. Its result is not
        checked; the test passes if nothing raises.
        """
        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.clf.df import v0_df, join

        def load(file):
            # Parse one (possibly gzipped) VCF file into a dataframe
            with open_maybe_gz(file, mode='r') as fd:
                return v0_df(ReadVCF(fd))

        frames = {k: load(file) for (k, file) in PATHS_LARGE_HEAD.items()}

        df = join(case=frames['case'], ctrl=frames['ctrl'])

        # One list of column names per group: first 'case', then 'ctrl'
        cols = tuple(
            [F"ALT{n}_{kind}" for n in range(3)]
            for kind in ['case', 'ctrl']
        )
        (case_cols, ctrl_cols) = cols

        # The test result is intentionally not inspected
        chi2_test(df[case_cols + ctrl_cols], cols, add=1)
    def test_chi2_large(self):
        """
        Smoke test: run `chi2_test` on the joined `URLS_LARGE` frames.

        Each resource is opened in binary mode, passed through
        `open_maybe_gz` in text mode and parsed into a dataframe. The
        case/ctrl frames are joined and `chi2_test` is called on the
        ALT0..ALT2 columns of both groups. Its result is not checked; the
        test passes if nothing raises.
        """
        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.clf.df import v0_df, join

        def load(url):
            # Binary download handle -> text stream -> dataframe
            with download(url).now.open(mode='rb') as raw, \
                    open_maybe_gz(raw, mode='r') as fd:
                return v0_df(ReadVCF(fd))

        frames = {k: load(URLS_LARGE[k]) for k in URLS_LARGE}

        df = join(case=frames['case'], ctrl=frames['ctrl'])

        # One list of column names per group: first 'case', then 'ctrl'
        cols = tuple(
            [F"ALT{n}_{kind}" for n in range(3)]
            for kind in ['case', 'ctrl']
        )
        (case_cols, ctrl_cols) = cols

        # The test result is intentionally not inspected
        chi2_test(df[case_cols + ctrl_cols], cols, add=1)
    def test_combine(self):
        """
        Regression test: the joined case/ctrl frame equals a stored reference.

        The dataframes built from the `PATHS` files are joined and compared
        with the table stored in `MY_SPACE / "reference.txt"`. To make both
        sides comparable, the candidate goes through the same CSV
        write/read round-trip (same separator, same dtype casts) as the
        reference.
        """
        from idiva.io import ReadVCF
        from idiva.io.vcf import SEP
        from idiva.clf.df import v0_df, join

        dfs = {}

        for k in PATHS:
            with PATHS[k].open(mode='r') as fd:
                # A unittest assertion is not stripped under `python -O`
                self.assertIsInstance(fd, io.TextIOBase)
                dfs[k] = v0_df(ReadVCF(fd))

        candidate = join(case=dfs['case'], ctrl=dfs['ctrl'])

        def read_csv(file) -> pd.DataFrame:
            # Parse a SEP-separated table and pin the column dtypes
            return pd.read_csv(file, sep=SEP).astype(
                {
                    'CHROM': str,
                    'POS': int,
                    'ID': str,
                    'ALT0_case': float, 'ALT1_case': float, 'ALT2_case': float,
                    'ALT0_ctrl': float, 'ALT1_ctrl': float, 'ALT2_ctrl': float,
                }
            )

        ref_file = MY_SPACE / "reference.txt"

        # Hack to create the reference
        # candidate.to_csv(ref_file, sep=SEP, index=True)

        reference = read_csv(ref_file)

        # Write and read `candidate` to make comparable to `reference`.
        # An in-memory buffer replaces the former NamedTemporaryFile:
        # reopening a still-open temporary file by name fails on Windows.
        buffer = io.StringIO()
        candidate.to_csv(buffer, sep=SEP, index=True)
        buffer.seek(0)
        candidate = read_csv(buffer)

        self.assertTrue(
            reference.equals(candidate),
            msg=f"joined dataframe differs from the reference in {ref_file}",
        )
 def df_maker1():
     """Join the v0 dataframes of `case` and `ctrl`, both defined outside this function."""
     # Note: v0_df is parallelized
     case_df = v0_df(case)
     ctrl_df = v0_df(ctrl)
     return join(case=case_df, ctrl=ctrl_df)