def test_makes_df_ctrl(self):
    """Build the v0 dataframe from the downloaded 'ctrl' VCF and check its size.

    The resulting dataframe must be non-empty and have exactly
    ``ref_len_v2['ctrl']`` rows.
    """
    # This takes about 20min on v1/v2
    from idiva.clf.df import v0_df
    from idiva.io import ReadVCF

    with download(URLS['ctrl']).now.open(mode='rb') as raw:
        # Distinct names for the binary and the text stream (no shadowing).
        with open_maybe_gz(raw) as fd:
            # unittest assertion instead of a bare `assert`: it is not
            # stripped under `python -O` and reports the offending type.
            self.assertIsInstance(fd, io.TextIOBase)
            df = v0_df(ReadVCF(fd))
            # assertGreater reports the actual length on failure,
            # unlike assertTrue(len(df) > 0).
            self.assertGreater(len(df), 0)
            self.assertEqual(len(df), ref_len_v2['ctrl'])
def test_combines(self):
    """Smoke test: join the dataframes built from every downloaded VCF in URLS.

    Nothing is asserted about the joined result; the test only checks that
    the whole pipeline runs without raising.
    """
    from idiva.io import ReadVCF
    from idiva.clf.df import v0_df, join

    def load(url):
        # Download, transparently open (possibly gzipped) and convert.
        with download(url).now.open(mode='rb') as raw:
            with open_maybe_gz(raw) as text:
                assert isinstance(text, io.TextIOBase)
                return v0_df(ReadVCF(text))

    frames = {key: load(URLS[key]) for key in URLS}

    # Result intentionally discarded.
    join(case=frames['case'], ctrl=frames['ctrl'])
def test_join_does_something(self):
    """Smoke test: join the dataframes built from the local VCF files in PATHS.

    Only checks that `join` runs on the 'case' and 'ctrl' frames; the
    joined result is not inspected.
    """
    from idiva.io import ReadVCF
    from idiva.clf.df import v0_df, join

    frames = {}
    for key in PATHS:
        path = PATHS[key]
        with path.open(mode='r') as stream:
            assert isinstance(stream, io.TextIOBase)
            frames[key] = v0_df(ReadVCF(stream))

    # Result intentionally discarded.
    join(case=frames['case'], ctrl=frames['ctrl'])
def test_makes_df(self):
    """Check that `v0_df` yields one row per item produced by `ReadVCF`.

    Each file in PATHS is read twice from the same handle: once to collect
    the raw items, once to build the dataframe. Both lengths must agree.
    """
    from idiva.clf.df import v0_df
    from idiva.io import ReadVCF
    from idiva.utils import seek_then_rewind

    for key in PATHS:
        with PATHS[key].open(mode='r') as stream:
            assert isinstance(stream, io.TextIOBase)

            # NOTE(review): seek_then_rewind presumably restores the stream
            # position on exit so the handle can be consumed twice -- confirm.
            with seek_then_rewind(stream):
                records = list(ReadVCF(stream))

            with seek_then_rewind(stream):
                frame = v0_df(ReadVCF(stream))

            n_records = len(records)
            n_rows = len(frame)
            self.assertEqual(n_records, n_rows)
def test_chi2_large_head(self):
    """Smoke test: run `chi2_test` on the joined frames from PATHS_LARGE_HEAD.

    The ALT0..ALT2 columns of the 'case' and 'ctrl' groups are selected from
    the joined dataframe and passed to `chi2_test` with ``add=1``. The
    return value is not checked.
    """
    from idiva.io import ReadVCF, open_maybe_gz
    from idiva.clf.df import v0_df, join

    def load(file):
        with open_maybe_gz(file, mode='r') as stream:
            return v0_df(ReadVCF(stream))

    frames = {key: load(file) for (key, file) in PATHS_LARGE_HEAD.items()}
    joined = join(case=frames['case'], ctrl=frames['ctrl'])

    # One list of column names per group: (case columns, ctrl columns).
    case_cols, ctrl_cols = (
        [f"ALT{n}_{kind}" for n in range(3)]
        for kind in ('case', 'ctrl')
    )
    cols = (case_cols, ctrl_cols)

    # Result intentionally discarded.
    chi2_test(joined[case_cols + ctrl_cols], cols, add=1)
def test_chi2_large(self):
    """Smoke test: run `chi2_test` on the joined frames downloaded from URLS_LARGE.

    The ALT0..ALT2 columns of the 'case' and 'ctrl' groups are selected from
    the joined dataframe and passed to `chi2_test` with ``add=1``. The
    return value is not checked.
    """
    from idiva.io import ReadVCF, open_maybe_gz
    from idiva.clf.df import v0_df, join

    def load(url):
        # Download as bytes, then reopen as text before parsing.
        with download(url).now.open(mode='rb') as raw:
            with open_maybe_gz(raw, mode='r') as text:
                return v0_df(ReadVCF(text))

    frames = {key: load(URLS_LARGE[key]) for key in URLS_LARGE}
    joined = join(case=frames['case'], ctrl=frames['ctrl'])

    # One list of column names per group: (case columns, ctrl columns).
    case_cols, ctrl_cols = (
        [f"ALT{n}_{kind}" for n in range(3)]
        for kind in ('case', 'ctrl')
    )
    cols = (case_cols, ctrl_cols)

    # Result intentionally discarded.
    chi2_test(joined[case_cols + ctrl_cols], cols, add=1)
def test_combine(self):
    """Check that joining the PATHS frames reproduces the stored reference.

    The joined 'case'/'ctrl' dataframe is serialized to CSV and parsed back
    with the same reader used for ``MY_SPACE / "reference.txt"``, so both
    sides go through identical parsing and dtype coercion before being
    compared with ``DataFrame.equals``.
    """
    from idiva.io import ReadVCF
    from idiva.io.vcf import SEP
    from idiva.clf.df import v0_df, join

    dfs = {}
    for k in PATHS:
        with PATHS[k].open(mode='r') as fd:
            # unittest assertion: not stripped under `python -O`.
            self.assertIsInstance(fd, io.TextIOBase)
            dfs[k] = v0_df(ReadVCF(fd))

    candidate = join(case=dfs['case'], ctrl=dfs['ctrl'])

    def read_csv(file) -> pd.DataFrame:
        # Parse a SEP-separated table and pin the column dtypes so that
        # reference and candidate are compared on equal footing.
        return pd.read_csv(file, sep=SEP).astype(
            {
                'CHROM': str,
                'POS': int,
                'ID': str,
                'ALT0_case': float,
                'ALT1_case': float,
                'ALT2_case': float,
                'ALT0_ctrl': float,
                'ALT1_ctrl': float,
                'ALT2_ctrl': float,
            }
        )

    ref_file = MY_SPACE / "reference.txt"

    # Hack to create the reference
    # candidate.to_csv(ref_file, sep=SEP, index=True)

    reference = read_csv(ref_file)

    # Write and read `candidate` to make comparable to `reference`.
    # An in-memory buffer is used instead of a NamedTemporaryFile: re-opening
    # a still-open named temporary file by name is not possible on every
    # platform, and the buffer avoids any dependency on the locale encoding.
    with io.StringIO() as buffer:
        candidate.to_csv(buffer, sep=SEP, index=True)
        buffer.seek(0)
        candidate = read_csv(buffer)

    self.assertTrue(reference.equals(candidate))
def df_maker1():
    """Return the join of the v0 dataframes built from `case` and `ctrl`.

    `case`, `ctrl`, `v0_df` and `join` are taken from the enclosing scope.
    """
    # Note: v0_df is parallelized
    case_df = v0_df(case)
    ctrl_df = v0_df(ctrl)
    return join(case=case_df, ctrl=ctrl_df)