def process(*, case_vcf: Path, ctrl_vcf: Path, out_dir: Path):
    """
    Process a case/control pair of VCF files in two passes.

    Both files are opened with `open_maybe_gz`. The first pass feeds the
    `head(...)` views of the two files to `process_vcf` with
    `out=out_dir / "head"`; the second pass feeds the full files with
    `out=out_dir / "full"`. Each result is handed to `post`.
    """
    from idiva.io import open_maybe_gz, head

    with open_maybe_gz(case_vcf) as case_full:
        with open_maybe_gz(ctrl_vcf) as ctrl_full:
            assert isinstance(case_full, io.TextIOBase)
            assert isinstance(ctrl_full, io.TextIOBase)

            # Pass 1: HEAD of both files.
            with head(case_full) as case_head, head(ctrl_full) as ctrl_head:
                for line in ("======================", "Processing VCF (HEAD).", "======================"):
                    log.info(line)

                head_result = process_vcf(
                    case=ReadVCF(case_head),
                    ctrl=ReadVCF(ctrl_head),
                    out=(out_dir / "head"),
                )
                post(head_result)

            # Pass 2: the complete files.
            for line in ("======================", "Processing VCF (FULL).", "======================"):
                log.info(line)

            full_result = process_vcf(
                case=ReadVCF(case_full),
                ctrl=ReadVCF(ctrl_full),
                out=(out_dir / "full"),
            )
            post(full_result)
def test_fisher_large_head(self):
    """Run `vcf_to_fisher` on the large HEAD case/control files and print the resulting frame."""
    from idiva.io import ReadVCF, open_maybe_gz
    from idiva.stat.vcf_to_fisher import vcf_to_fisher
    from idiva.utils.testing import whatsmyname

    with open_maybe_gz(PATHS_LARGE_HEAD['case'], mode='r') as fd_case, \
            open_maybe_gz(PATHS_LARGE_HEAD['ctrl'], mode='r') as fd_ctrl:
        df = vcf_to_fisher(case=ReadVCF(fd_case), ctrl=ReadVCF(fd_ctrl)).df

        # Not used further here; kept because `whatsmyname` is opaque from this test.
        out_dir = MY_SPACE / whatsmyname()

        print(df)
def test_check_all(self):
    """Print every item produced by `check_all` for each downloaded group in URLS."""
    from idiva.io.ass import check_all

    for group in URLS:
        with download(URLS[group]).now.open(mode='rb') as raw_fd, open_maybe_gz(raw_fd) as text_fd:
            for outcome in check_all(text_fd):
                print(group, outcome)
def test_md5_full(self):
    """Access `ReadVCF(...).md5` for every file in PATHS_LARGE_FULL (no value is asserted)."""
    from idiva.io import ReadVCF, open_maybe_gz

    for path in PATHS_LARGE_FULL.values():
        with open_maybe_gz(path, mode='r') as handle:
            assert isinstance(handle, io.TextIOBase)
            # Only the attribute access matters; the digest itself is discarded.
            _ = ReadVCF(handle).md5
def test_poc_head(self):
    """
    Proof-of-concept.

    For each (key, reference file) in PATHS_LARGE_HEAD: parse the file with
    ReadVCF, print its meta lines, header line and datalines into a
    candidate `.log` file under MY_SPACE, then compare the candidate
    against the reference line by line.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-mangled source; confirm against the original indentation.
    from idiva.io import ReadVCF, open_maybe_gz
    for (k, ref_file) in PATHS_LARGE_HEAD.items():
        # Candidate file name: "<this function's name>__<k>.log".
        can_file = (MY_SPACE / F"{sys._getframe().f_code.co_name}__{k}").with_suffix(".log")
        with open_maybe_gz(ref_file, mode='r') as fd_ref:
            assert isinstance(fd_ref, io.TextIOBase)
            vcf = ReadVCF(fd_ref).preload_all()
            with open(can_file, mode='w') as fd_can:
                # Everything printed below goes into the candidate file.
                with redirect_stdout(fd_can):
                    # Meta lines. Note that `k` and `v` are rebound here,
                    # shadowing the outer loop variable `k`.
                    for (k, v) in vcf.meta.items():
                        if isinstance(v, str):
                            # fileDate / source values are replaced by the
                            # ones provided by idiva.io.out.
                            if (str(k).lower() == "filedate"):
                                from idiva.io.out import fileDate
                                v = fileDate
                            if (str(k).lower() == "source"):
                                from idiva.io.out import source
                                v = source
                            print(F"##{k}={v}")
                        elif isinstance(v, dict):
                            # Structured meta: one line per ID, with the
                            # inner dict rendered as comma-separated
                            # key=value pairs ('.' stands in for None).
                            for (i, v) in v.items():
                                assert isinstance(v, dict)
                                assert v
                                # The generator's (k, v) are local to it, so
                                # `k` in the print below is still the meta key.
                                p = ','.join(F"{k}={v if v is not None else '.'}" for (k, v) in v.items())
                                print(F"##{k}=<ID={i},{p}>")
                    # Header line, then all datalines.
                    print(F"#{SEP.join(vcf.header)}")
                    for dataline in vcf:
                        print(str(dataline))
        from idiva.io import Oneliner
        # Re-read both files and compare them line by line.
        with open_maybe_gz(ref_file, mode='r') as fd_ref:
            with open_maybe_gz(can_file, mode='r') as fd_can:
                assert isinstance(fd_ref, io.TextIOBase)
                assert isinstance(fd_can, io.TextIOBase)
                lines_ref = list(Oneliner(fd_ref))
                lines_can = list(Oneliner(fd_can))
                for (ref, can) in zip(lines_ref, lines_can):
                    # fileDate / source lines are excluded from the comparison
                    # (presumably because they were substituted above).
                    if not (ref.startswith("##fileDate") or ref.startswith("##source")):
                        self.assertEqual(can, ref)
                # zip() stops at the shorter list, so check the lengths too.
                self.assertEqual(len(lines_can), len(lines_ref))
def open(cls, file):
    """
    Yield an instance of `cls` for `file`.

    If `file` is not yet an instance of `cls`, it is opened with
    `open_maybe_gz` and the resulting handle is wrapped in `cls`; the
    handle's `with` block ends when the generator is resumed. If `file`
    already is an instance of `cls`, a warning is logged and `file` is
    yielded unchanged.
    """
    from idiva.io import open_maybe_gz

    if not isinstance(file, cls):
        with open_maybe_gz(file) as handle:
            yield cls(handle)
        return

    # This shouldn't normally happen
    log.warning(F"`file` is already a {cls.__name__}.")
    yield file
def test_reads_all_lines(self):
    """Iterating a ReadVCF yields as many items as the file has lines left after construction."""
    from idiva.utils import seek_then_rewind

    for key in PATHS:
        with open_maybe_gz(PATHS[key], mode='r') as handle:
            vcf = ReadVCF(handle)

            # seek=None: count lines from wherever ReadVCF left the cursor.
            with seek_then_rewind(handle, seek=None):
                remaining_lines = handle.readlines()
            reference = len(remaining_lines)

            with vcf.rewind_when_done:
                datalines = list(vcf)
            candidate = len(datalines)

            self.assertEqual(candidate, reference)
def test_makes_df_ctrl(self):
    """Build the v0 dataframe from the downloaded 'ctrl' VCF and check its length against the reference."""
    # This takes about 20min on v1/v2
    from idiva.io import ReadVCF
    from idiva.clf.df import v0_df

    with download(URLS['ctrl']).now.open(mode='rb') as raw_fd, open_maybe_gz(raw_fd) as text_fd:
        assert isinstance(text_fd, io.TextIOBase)
        frame = v0_df(ReadVCF(text_fd))

        n_rows = len(frame)
        self.assertTrue(n_rows > 0)
        self.assertEqual(n_rows, ref_len_v2['ctrl'])
def test_combines(self):
    """Build a v0 dataframe per downloaded URL group and join the 'case' and 'ctrl' frames."""
    from idiva.clf.df import v0_df, join
    from idiva.io import ReadVCF

    frames = {}
    for group in URLS:
        with download(URLS[group]).now.open(mode='rb') as raw_fd, open_maybe_gz(raw_fd) as text_fd:
            assert isinstance(text_fd, io.TextIOBase)
            frames[group] = v0_df(ReadVCF(text_fd))

    # Only checks that the join runs; the result is not inspected.
    df = join(case=frames['case'], ctrl=frames['ctrl'])
def test_md5_head(self):
    """`ReadVCF(...).md5` equals the MD5 hex digest of the encoded file text, for each HEAD file."""
    import hashlib

    from idiva.utils import seek_then_rewind
    from idiva.io import ReadVCF, open_maybe_gz

    for path in PATHS_LARGE_HEAD.values():
        with open_maybe_gz(path, mode='r') as handle:
            assert isinstance(handle, io.TextIOBase)

            # Read the whole file from position 0 to compute the reference digest.
            with seek_then_rewind(handle, seek=0):
                text = handle.read()
                reference = hashlib.md5(text.encode()).hexdigest()

            candidate = ReadVCF(handle).md5
            self.assertEqual(candidate, reference)
def test_chi2_large_head(self):
    """Join the v0 dataframes of the large HEAD files and run `chi2_test` on the ALT0..ALT2 columns."""
    from idiva.clf.df import v0_df, join
    from idiva.io import ReadVCF, open_maybe_gz

    frames = {}
    for (label, path) in PATHS_LARGE_HEAD.items():
        with open_maybe_gz(path, mode='r') as handle:
            frames[label] = v0_df(ReadVCF(handle))

    df = join(case=frames['case'], ctrl=frames['ctrl'])

    # One list of column names per group: ALT0_<kind>, ALT1_<kind>, ALT2_<kind>.
    case_cols = [F"ALT{n}_{kind}" for kind in ['case'] for n in range(3)]
    ctrl_cols = [F"ALT{n}_{kind}" for kind in ['ctrl'] for n in range(3)]
    cols = (case_cols, ctrl_cols)

    pp = chi2_test(df[case_cols + ctrl_cols], cols, add=1)
def test_chi2_large(self):
    """Join the v0 dataframes of the downloaded large VCFs and run `chi2_test` on the ALT0..ALT2 columns."""
    from idiva.clf.df import v0_df, join
    from idiva.io import ReadVCF, open_maybe_gz

    frames = {}
    for label in URLS_LARGE:
        with download(URLS_LARGE[label]).now.open(mode='rb') as raw_fd:
            with open_maybe_gz(raw_fd, mode='r') as text_fd:
                frames[label] = v0_df(ReadVCF(text_fd))

    df = join(case=frames['case'], ctrl=frames['ctrl'])

    # One list of column names per group: ALT0_<kind>, ALT1_<kind>, ALT2_<kind>.
    case_cols = [F"ALT{n}_{kind}" for kind in ['case'] for n in range(3)]
    ctrl_cols = [F"ALT{n}_{kind}" for kind in ['ctrl'] for n in range(3)]
    cols = (case_cols, ctrl_cols)

    p = chi2_test(df[case_cols + ctrl_cols], cols, add=1)
def open(ReadVCF, file):
    """
    Yield a `ReadVCF` for `file`.

    If `file` is already a `ReadVCF`, a warning is logged, its file
    descriptor is moved to `dataline_start_pos`, and it is yielded inside
    its `rewind_when_done` context, so the cursor is put back after use.
    Otherwise `file` is opened with `open_maybe_gz` and wrapped in a new
    `ReadVCF`; in that case the cursor is not put back after use.
    """
    if not isinstance(file, ReadVCF):
        from idiva.io import open_maybe_gz

        with open_maybe_gz(file) as handle:
            yield ReadVCF(handle)
        return

    log.warning(F"Reopening VCF: {file.fd}.")
    with file.rewind_when_done:
        # Start over from the first dataline.
        file.fd.seek(file.dataline_start_pos)
        yield file
def test_plain(self):
    """Reading the plain reference file through `open_maybe_gz` gives the reference content."""
    with open_maybe_gz(ref_file, mode='r') as handle:
        candidate = handle.read()
        self.assertEqual(reference, candidate)
def test_gz(self):
    """Reading the gzipped reference file in mode 'r' gives a `str` equal to the reference content."""
    with open_maybe_gz(ref_file_gz, mode='r') as handle:
        content = handle.read()

        self.assertIsInstance(content, str)
        self.assertEqual(reference, content)
def test_dont_open_r_as_rb(self):
    """Passing a handle opened in mode 'r' to `open_maybe_gz(..., mode='rb')` raises AssertionError."""
    with open(ref_file, mode='r') as text_fd, self.assertRaises(AssertionError):
        with open_maybe_gz(text_fd, mode='rb'):
            pass
def test_open_from_rb_as_rb(self):
    """A binary handle passed through `open_maybe_gz(..., mode='rb')` decodes to the reference content."""
    with open(ref_file, mode='rb') as raw_fd:
        with open_maybe_gz(raw_fd, mode='rb') as binary_fd:
            # Decode the binary stream so it can be compared with the reference.
            with io.TextIOWrapper(binary_fd) as text_fd:
                content = text_fd.read()
                self.assertEqual(reference, content)