def plinkify(ds, min=None, max=None):
    """Run ``plink --genome full`` on ``ds`` and collect the pairwise results.

    ``ds`` is exported to a temporary VCF with ``hl.export_vcf``, PLINK is run
    on that file, and the ``.genome`` report it writes is parsed.

    :param ds: dataset passed to ``hl.export_vcf``.
    :param min: value for PLINK's ``--min`` flag; the flag is omitted when ``None``.
    :param max: value for PLINK's ``--max`` flag; the flag is omitted when ``None``.
    :return: dict mapping ``(iid1, iid2)`` to
        ``([Z0, Z1, Z2, PI_HAT], [ibs0, ibs1, ibs2])`` (floats, then ints).
    """
    vcf = utils.new_temp_file(prefix="plink", suffix="vcf")
    plinkpath = utils.new_temp_file(prefix="plink")
    hl.export_vcf(ds, vcf)

    # Compare against None rather than relying on truthiness: a threshold of
    # 0 is a legitimate value and must still be forwarded to PLINK.
    threshold_flags = []
    if min is not None:
        threshold_flags.append("--min {}".format(min))
    if max is not None:
        threshold_flags.append("--max {}".format(max))
    threshold_string = " ".join(threshold_flags)

    plink_command = "plink --double-id --allow-extra-chr --vcf {} --genome full --out {} {}" \
        .format(utils.uri_path(vcf), utils.uri_path(plinkpath), threshold_string)
    result_file = utils.uri_path(plinkpath + ".genome")

    # NOTE(review): the return value of syscall is not inspected; if PLINK
    # fails, the open() below is what surfaces the problem.
    syscall(plink_command, shell=True, stdout=DEVNULL, stderr=DEVNULL)

    # Columns of a .genome row after strip().split() (0-based):
    #   0 fid1, 1 iid1, 2 fid2, 3 iid2, 4 rt, 5 ez, 6 z0, 7 z1, 8 z2,
    #   9 pihat, 10 phe, 11 dst, 12 ppc, 13 ratio, 14 ibs0, 15 ibs1,
    #   16 ibs2, 17 homhom, 18 hethet
    # Returned layout per pair:
    #   (iid1, iid2) -> ([Z0, Z1, Z2, PI_HAT], [ibs0, ibs1, ibs2])
    results = {}
    with open(result_file) as f:
        f.readline()  # skip the header line
        for line in f:
            row = line.strip().split()
            if not row:
                # tolerate blank lines instead of failing with IndexError
                continue
            results[(row[1], row[3])] = (list(map(float, row[6:10])),
                                         list(map(int, row[14:17])))
    return results
def test_import_table_force_bgz(self):
    """A table exported under a ``.bgz`` name and copied to a ``.gz`` name
    must import back identically when ``force_bgz=True`` is passed."""
    bgz_path = new_temp_file(suffix=".bgz")
    expected = hl.utils.range_table(10, 5)
    expected.export(bgz_path)

    # Same bytes, different extension: only the file name changes.
    gz_path = new_temp_file(suffix=".gz")
    run_command(["cp", uri_path(bgz_path), uri_path(gz_path)])

    reimported = hl.import_table(gz_path, force_bgz=True, impute=True)
    reimported = reimported.key_by('idx')
    self.assertTrue(expected._same(reimported))
def test_export_plink(self):
    """Compare ``hl.export_plink`` with PLINK's own conversion of the same
    split VCF, after shuffling the column order."""
    import random

    source_vcf = resource('sample.vcf')
    mt = hl.split_multi_hts(hl.import_vcf(source_vcf, min_partitions=10))

    # permute columns so not in alphabetical order!
    col_order = list(range(mt.count_cols()))
    random.shuffle(col_order)
    mt = mt.choose_cols(col_order)

    split_vcf_file = uri_path(new_temp_file())
    hl_output = uri_path(new_temp_file())
    plink_output = uri_path(new_temp_file())
    merge_output = uri_path(new_temp_file())

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    run_command(["plink", "--vcf", split_vcf_file,
                 "--make-bed", "--out", plink_output,
                 "--const-fid", "--keep-allele-order"])

    # Rewrite the second .bim column as col0:col3:col5:col4 so that PLINK's
    # variant ids can be matched against the Hail export during the merge.
    bim_path = plink_output + ".bim"
    rewritten = []
    with open(uri_path(bim_path)) as bim_in:
        for line in bim_in:
            fields = line.strip().split()
            fields[1] = ":".join([fields[0], fields[3], fields[5], fields[4]])
            rewritten.append("\t".join(fields) + "\n")

    with open(bim_path, 'w') as bim_out:
        bim_out.writelines(rewritten)

    run_command(["plink", "--bfile", plink_output,
                 "--bmerge", hl_output,
                 "--merge-mode", "6",
                 "--out", merge_output])

    # The .diff report must contain nothing except header rows.
    header = ["SNP", "FID", "IID", "NEW", "OLD"]
    with open(merge_output + ".diff") as diff:
        only_header = all(line.strip().split() == header for line in diff)

    self.assertTrue(only_header)
def test_impute_sex_same_as_plink(self):
    """Check ``hl.impute_sex`` against ``plink --check-sex`` on the
    x-chromosome resource, then check the ``aaf`` argument path."""
    import subprocess as sp

    ds = hl.import_vcf(resource('x-chromosome.vcf'))
    sex = hl.impute_sex(ds.GT, include_par=True)

    vcf_file = utils.uri_path(utils.new_temp_file(prefix="plink", suffix="vcf"))
    out_file = utils.uri_path(utils.new_temp_file(prefix="plink"))
    hl.export_vcf(ds, vcf_file)

    plink_args = ["plink", "--vcf", vcf_file, "--const-fid", "--check-sex",
                  "--silent", "--out", out_file]
    try:
        sp.check_output(plink_args, stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        # Surface PLINK's combined stdout/stderr before failing the test.
        print(e.output)
        raise e

    sexcheck = hl.import_table(out_file + '.sexcheck',
                               delimiter=' +',
                               types={'SNPSEX': hl.tint32,
                                      'F': hl.tfloat64})
    sexcheck = sexcheck.select('IID', 'SNPSEX', 'F')

    # SNPSEX == 2 -> True, SNPSEX == 1 -> False, anything else -> missing.
    snpsex = sexcheck.SNPSEX
    male_or_missing = hl.cond(snpsex == 1, False, hl.null(hl.tbool))
    plink_sex = sexcheck.select(
        s=sexcheck.IID,
        is_female=hl.cond(snpsex == 2, True, male_or_missing),
        f_stat=sexcheck.F)
    plink_sex = plink_sex.key_by('s')

    sex = sex.select(s=sex.s, is_female=sex.is_female, f_stat=sex.f_stat)

    self.assertTrue(plink_sex._same(sex.select_globals(), tolerance=1e-3))

    # Supplying a precomputed row field as ``aaf`` must give the same table
    # as the default call.
    ds = ds.annotate_rows(aaf=(agg.call_stats(ds.GT, ds.alleles)).AF[1])
    default_result = hl.impute_sex(ds.GT)
    self.assertTrue(default_result._same(hl.impute_sex(ds.GT, aaf='aaf')))
def test_export_plink(self):
    """Compare ``hl.export_plink`` with PLINK's own conversion of the same
    split VCF by merging the two filesets in diff mode."""
    mt = hl.split_multi_hts(
        hl.import_vcf(resource('sample.vcf'), min_partitions=10))

    split_vcf_file = uri_path(new_temp_file())
    hl_output = uri_path(new_temp_file())
    plink_output = uri_path(new_temp_file())
    merge_output = uri_path(new_temp_file())

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    make_bed_cmd = ["plink", "--vcf", split_vcf_file, "--make-bed",
                    "--out", plink_output, "--const-fid",
                    "--keep-allele-order"]
    run_command(make_bed_cmd)

    # Replace the second .bim column with col0:col3:col5:col4 so variant ids
    # on the PLINK side can be matched against the Hail export in the merge.
    bim_file = plink_output + ".bim"
    with open(uri_path(bim_file)) as src:
        split_lines = [raw.strip().split() for raw in src]

    patched = []
    for cols in split_lines:
        cols[1] = ":".join([cols[0], cols[3], cols[5], cols[4]])
        patched.append("\t".join(cols) + "\n")

    with open(bim_file, 'w') as dst:
        dst.writelines(patched)

    merge_cmd = ["plink", "--bfile", plink_output, "--bmerge", hl_output,
                 "--merge-mode", "6", "--out", merge_output]
    run_command(merge_cmd)

    # Passing means every line of the .diff report is the header row.
    expected_header = ["SNP", "FID", "IID", "NEW", "OLD"]
    with open(merge_output + ".diff") as report:
        same = all(entry.strip().split() == expected_header
                   for entry in report)

    self.assertTrue(same)
def hail_calculation(ds):
    """Return ``hl.realized_relationship_matrix(ds['GT'])`` as a NumPy array.

    The matrix is written with ``export_tsv`` to a temporary file and read
    back; the first line of that file is discarded.
    """
    rrm = hl.realized_relationship_matrix(ds['GT'])
    tsv_path = utils.new_temp_file(suffix='.tsv')
    rrm.export_tsv(tsv_path)

    with open(utils.uri_path(tsv_path)) as tsv:
        tsv.readline()  # skip the first line (presumably a header)
        matrix_rows = [[float(cell) for cell in line.strip().split()]
                       for line in tsv]

    return np.array(matrix_rows)
def test_grm(self):
    """Compare the exports of ``hl.genetic_relatedness_matrix`` with the
    files PLINK writes for ``--make-rel``, ``--make-grm-gz`` and
    ``--make-grm-bin`` on the same data."""
    tolerance = 0.001

    def load_id_file(path):
        # Each line must hold exactly two tab-separated fields; the second
        # one is collected.
        collected = []
        with hl.hadoop_open(path) as handle:
            for line in handle:
                fields = line.strip().split('\t')
                self.assertEqual(len(fields), 2)
                collected.append(fields[1])
        return collected

    def load_rel(ns, path):
        # Row i of the file must carry i + 1 tab-separated values, and the
        # file must have exactly ns rows.
        matrix = np.zeros((ns, ns))
        with hl.hadoop_open(path) as handle:
            for row_idx, line in enumerate(handle):
                cells = line.strip().split('\t')
                for col_idx, value in enumerate(map(float, cells)):
                    matrix[row_idx, col_idx] = value
                self.assertEqual(col_idx, row_idx)
            self.assertEqual(row_idx, ns - 1)
        return matrix

    def load_grm(ns, nv, path):
        # Tab-separated lines: 1-based row, 1-based column, a count that must
        # equal nv, and the matrix value.
        matrix = np.zeros((ns, ns))
        with utils.hadoop_open(path) as handle:
            n_lines = 0
            for n_lines, line in enumerate(handle, 1):
                fields = line.strip().split('\t')
                self.assertEqual(int(fields[2]), nv)
                target = (int(fields[0]) - 1, int(fields[1]) - 1)
                matrix[target] = float(fields[3])
            self.assertEqual(n_lines, ns * (ns + 1) / 2)
        return matrix

    def load_bin(ns, path):
        # Little-endian 4-byte floats, lower triangle row by row, with no
        # trailing bytes allowed.
        matrix = np.zeros((ns, ns))
        with utils.hadoop_open(path, 'rb') as handle:
            for row_idx in range(ns):
                for col_idx in range(row_idx + 1):
                    chunk = handle.read(4)
                    self.assertEqual(len(chunk), 4)
                    matrix[row_idx, col_idx] = unpack('<f', bytearray(chunk))[0]
            remainder = handle.read()
            self.assertEqual(len(remainder), 0)
        return matrix

    b_file = utils.new_temp_file(prefix="plink")
    rel_file = utils.new_temp_file(prefix="test", suffix="rel")
    rel_id_file = utils.new_temp_file(prefix="test", suffix="rel.id")
    grm_file = utils.new_temp_file(prefix="test", suffix="grm")
    grm_bin_file = utils.new_temp_file(prefix="test", suffix="grm.bin")
    grm_nbin_file = utils.new_temp_file(prefix="test", suffix="grm.N.bin")

    def run_plink(command_template):
        # Run PLINK against b_file with a fresh --out prefix; return the prefix.
        out_prefix = utils.new_temp_file(prefix="plink")
        command = command_template.format(utils.uri_path(b_file),
                                          utils.uri_path(out_prefix))
        syscall(command, shell=True, stdout=DEVNULL, stderr=DEVNULL)
        return out_prefix

    dataset = self.get_dataset()
    n_samples = dataset.count_cols()

    # Keep rows with 0 < AC < 2 * n_called in which every sample is called.
    dataset = dataset.annotate_rows(
        AC=agg.sum(dataset.GT.n_alt_alleles()),
        n_called=agg.count_where(hl.is_defined(dataset.GT)))
    dataset = dataset.filter_rows(
        (dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called))
    dataset = dataset.filter_rows(dataset.n_called == n_samples).persist()

    hl.export_plink(dataset, b_file, id=dataset.s)

    sample_ids = [col.s for col in dataset.cols().select('s').collect()]
    n_variants = dataset.count_rows()
    self.assertGreater(n_variants, 0)

    grm = hl.genetic_relatedness_matrix(dataset)
    grm.export_id_file(rel_id_file)

    ############
    ### rel

    p_file = run_plink('''plink --bfile {} --make-rel --out {}''')
    self.assertEqual(load_id_file(p_file + ".rel.id"), sample_ids)

    grm.export_rel(rel_file)
    self.assertEqual(load_id_file(rel_id_file), sample_ids)
    plink_rel = load_rel(n_samples, p_file + ".rel")
    hail_rel = load_rel(n_samples, rel_file)
    self.assertTrue(np.allclose(plink_rel, hail_rel, atol=tolerance))

    ############
    ### gcta-grm

    p_file = run_plink('''plink --bfile {} --make-grm-gz --out {}''')
    self.assertEqual(load_id_file(p_file + ".grm.id"), sample_ids)

    grm.export_gcta_grm(grm_file)
    plink_grm = load_grm(n_samples, n_variants, p_file + ".grm.gz")
    hail_grm = load_grm(n_samples, n_variants, grm_file)
    self.assertTrue(np.allclose(plink_grm, hail_grm, atol=tolerance))

    ############
    ### gcta-grm-bin

    p_file = run_plink('''plink --bfile {} --make-grm-bin --out {}''')
    self.assertEqual(load_id_file(p_file + ".grm.id"), sample_ids)

    grm.export_gcta_grm_bin(grm_bin_file, grm_nbin_file)

    plink_bin = load_bin(n_samples, p_file + ".grm.bin")
    hail_bin = load_bin(n_samples, grm_bin_file)
    self.assertTrue(np.allclose(plink_bin, hail_bin, atol=tolerance))

    plink_nbin = load_bin(n_samples, p_file + ".grm.N.bin")
    hail_nbin = load_bin(n_samples, grm_nbin_file)
    self.assertTrue(np.allclose(plink_nbin, hail_nbin, atol=tolerance))