def test_export_rectangles(self):
    nd = np.arange(0, 80, dtype=float).reshape(8, 10)

    rects1 = [[0, 1, 0, 1], [4, 5, 7, 8]]
    rects2 = [[4, 5, 0, 10], [0, 8, 4, 5]]
    rects3 = [[0, 1, 0, 1], [1, 2, 1, 2], [2, 3, 2, 3], [3, 5, 3, 6],
              [3, 6, 3, 7], [3, 7, 3, 8], [4, 5, 0, 10], [0, 8, 4, 5],
              [0, 8, 0, 10]]

    for rects in [rects1, rects2, rects3]:
        for block_size in [3, 4, 10]:
            bm_uri = new_temp_file()
            rect_path = new_local_temp_dir()
            rect_uri = local_path_uri(rect_path)

            (BlockMatrix.from_numpy(nd, block_size=block_size)
             .sparsify_rectangles(rects)
             .write(bm_uri, force_row_major=True))

            BlockMatrix.export_rectangles(bm_uri, rect_uri, rects)

            for (i, r) in enumerate(rects):
                file = rect_path + '/rect-' + str(i) + '_' + '-'.join(map(str, r))
                expected = nd[r[0]:r[1], r[2]:r[3]]
                actual = np.loadtxt(file, ndmin=2)
                self._assert_eq(expected, actual)

            rect_path_bytes = new_local_temp_dir()
            rect_uri_bytes = local_path_uri(rect_path_bytes)

            BlockMatrix.export_rectangles(bm_uri, rect_uri_bytes, rects, binary=True)

            for (i, r) in enumerate(rects):
                file = rect_path_bytes + '/rect-' + str(i) + '_' + '-'.join(map(str, r))
                expected = nd[r[0]:r[1], r[2]:r[3]]
                actual = np.reshape(np.fromfile(file), (r[1] - r[0], r[3] - r[2]))
                self._assert_eq(expected, actual)

    bm_uri = new_temp_file()
    rect_uri = new_temp_file()

    (BlockMatrix.from_numpy(nd, block_size=5)
     .sparsify_rectangles([[0, 1, 0, 1]])
     .write(bm_uri, force_row_major=True))

    with self.assertRaises(FatalError) as e:
        BlockMatrix.export_rectangles(bm_uri, rect_uri, [[5, 6, 5, 6]])
    # The assertion must sit outside the `with` block (inside it, the raising
    # call above would skip it), and unittest exposes the caught error as
    # `e.exception`, not `e.msg`.
    self.assertEqual(str(e.exception),
                     'block (1, 1) missing for rectangle 0 with bounds [5, 6, 5, 6]')

def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite):
    starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False)

    # Lifted directly from https://github.com/hail-is/hail/blob/555e02d6c792263db2c3ed97db8002b489e2dacb/hail/python/hail/methods/statgen.py#L2595
    # for the time being, until efficient BlockMatrix filtering gets an easier interface
    # This is required, as the squaring/multiplication densifies, so this re-sparsifies.
    r2_adj = BlockMatrix._from_java(
        r2_adj._jbm.filterRowIntervalsIR(
            Env.backend()._to_java_ir(starts_and_stops._ir), False))

    l2row = r2_adj.sum(axis=0).T
    l2col = r2_adj.sum(axis=1)
    l2 = l2row + l2col + 1

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index().rename({'f0': 'ld_score'})
    ht_scores = ht_scores.key_by('idx')

    ht = ht.annotate(**ht_scores[ht.new_idx]).select_globals()
    ht.filter(hl.is_defined(ht.ld_score)).write(out_name, overwrite)

def plinkify(ds, min=None, max=None):
    vcf = utils.new_temp_file(prefix="plink", suffix="vcf")
    plinkpath = utils.new_temp_file(prefix="plink")
    hl.export_vcf(ds, vcf)
    threshold_string = "{} {}".format("--min {}".format(min) if min else "",
                                      "--max {}".format(max) if max else "")

    plink_command = "plink --double-id --allow-extra-chr --vcf {} --genome full --out {} {}" \
        .format(utils.uri_path(vcf), utils.uri_path(plinkpath), threshold_string)
    result_file = utils.uri_path(plinkpath + ".genome")

    syscall(plink_command, shell=True, stdout=DEVNULL, stderr=DEVNULL)

    ### format of .genome file is:
    # _, fid1, iid1, fid2, iid2, rt, ez, z0, z1, z2, pihat, phe,
    # dst, ppc, ratio, ibs0, ibs1, ibs2, homhom, hethet (+ separated)

    ### format of ibd is:
    # i (iid1), j (iid2), ibd: {Z0, Z1, Z2, PI_HAT}, ibs0, ibs1, ibs2

    results = {}
    with open(result_file) as f:
        f.readline()
        for line in f:
            row = line.strip().split()
            results[(row[1], row[3])] = (list(map(float, row[6:10])),
                                         list(map(int, row[14:17])))
    return results

def test_import_table_force_bgz(self):
    f = new_temp_file(suffix=".bgz")
    t = hl.utils.range_table(10, 5)
    t.export(f)

    f2 = new_temp_file(suffix=".gz")
    run_command(["cp", uri_path(f), uri_path(f2)])
    t2 = hl.import_table(f2, force_bgz=True, impute=True).key_by('idx')
    self.assertTrue(t._same(t2))

def generate_ld_scores_from_ld_matrix(pop_data, data_type,
                                      min_frequency=0.01,
                                      call_rate_cutoff=0.8,
                                      adj: bool = False,
                                      radius: int = 1000000,
                                      overwrite=False):
    # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR
    # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 mins to compute LD scores per population (~$150 total)
    for label, pops in dict(pop_data).items():
        for pop, n in pops.items():
            if pop in ('nfe', 'fin', 'asj'):
                continue
            ht = hl.read_table(ld_index_path(data_type, pop, adj=adj))
            ht = ht.filter((ht.pop_freq.AF >= min_frequency) &
                           (ht.pop_freq.AF <= 1 - min_frequency) &
                           (ht.pop_freq.AN / n >= 2 * call_rate_cutoff)).add_index(name='new_idx')

            indices = ht.idx.collect()

            r2 = BlockMatrix.read(ld_matrix_path(data_type, pop, min_frequency >= COMMON_FREQ, adj=adj))
            r2 = r2.filter(indices, indices) ** 2
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False)

            # Lifted directly from https://github.com/hail-is/hail/blob/555e02d6c792263db2c3ed97db8002b489e2dacb/hail/python/hail/methods/statgen.py#L2595
            # for the time being, until efficient BlockMatrix filtering gets an easier interface
            r2_adj = BlockMatrix._from_java(
                r2_adj._jbm.filterRowIntervalsIR(
                    Env.backend()._to_java_ir(starts_and_stops._ir), False))

            l2row = r2_adj.sum(axis=0).T
            l2col = r2_adj.sum(axis=1)
            l2 = l2row + l2col + 1

            l2_bm_tmp = new_temp_file()
            l2_tsv_tmp = new_temp_file()
            l2.write(l2_bm_tmp, force_row_major=True)
            BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

            ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
            ht_scores = ht_scores.add_index().rename({'f0': 'ld_score'})
            ht_scores = ht_scores.key_by('idx')

            ht = ht.annotate(**ht_scores[ht.new_idx]).select_globals()
            ht.filter(hl.is_defined(ht.ld_score)).write(ld_scores_path(data_type, pop, adj), overwrite)

def test_specify_different_index_file(self):
    sample_file = resource('random.sample')
    bgen_file = resource('random.bgen')
    index_file = new_temp_file(suffix='idx2')
    index_file_map = {bgen_file: index_file}
    hl.index_bgen(bgen_file, index_file_map=index_file_map)
    mt = hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file, index_file_map=index_file_map)
    self.assertEqual(mt.count(), (30, 10))

    with self.assertRaisesRegex(FatalError, 'missing a .idx2 file extension'):
        index_file = new_temp_file()
        index_file_map = {bgen_file: index_file}
        hl.index_bgen(bgen_file, index_file_map=index_file_map)

def test_multi_write():
    vds1 = hl.vds.read_vds(os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'))
    to_keep = vds1.variant_data.filter_cols(vds1.variant_data.s == 'HG00187').cols()
    vds2 = hl.vds.filter_samples(vds1, to_keep)
    path1 = new_temp_file()
    path2 = new_temp_file()
    hl.vds.write_variant_datasets([vds1, vds2], [path1, path2])
    assert hl.vds.read_vds(path1)._same(vds1)
    assert hl.vds.read_vds(path2)._same(vds2)

def gather(ht, key, value, *fields) -> Table:
    """Collapse fields into key-value pairs.

    :func:`.gather` mimics the functionality of the `gather()` function found
    in R's ``tidyr`` package. This is a way to turn "wide" format data into
    "long" format data.

    Parameters
    ----------
    ht : :class:`.Table`
        A Hail table.
    key : :obj:`str`
        The name of the key field in the gathered table.
    value : :obj:`str`
        The name of the value field in the gathered table.
    fields : variable-length args of :obj:`str`
        Names of fields to gather in ``ht``.

    Returns
    -------
    :class:`.Table`
        Table with original ``fields`` gathered into ``key`` and ``value``
        fields.
    """
    ht = ht.annotate(_col_val=hl.array([hl.array([field, ht[field]]) for field in fields]))
    ht = ht.drop(*fields)
    ht = ht.explode(ht['_col_val'])
    ht = ht.annotate(**{key: ht['_col_val'][0], value: ht['_col_val'][1]})
    ht = ht.drop('_col_val')

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)

    return hl.read_table(ht_tmp)

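# A minimal usage sketch for `gather` above (illustrative data; assumes an
# initialized Hail session with `gather` in scope). Note that because the
# implementation packs the field name and the field value into one array,
# the gathered fields must themselves be strings.
def example_gather():
    wide = hl.Table.parallelize(
        [{'sample': 'S1', 'gt_chr1': '0/1', 'gt_chr2': '1/1'},
         {'sample': 'S2', 'gt_chr1': '0/0', 'gt_chr2': '0/1'}],
        hl.tstruct(sample=hl.tstr, gt_chr1=hl.tstr, gt_chr2=hl.tstr),
        key='sample')

    # One row per (sample, gathered field) pair: "wide" becomes "long".
    long = gather(wide, 'field', 'gt', 'gt_chr1', 'gt_chr2')
    long.show()
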
def test_pc_relate_against_R_truth():
    mt = hl.import_vcf(resource('pc_relate_bn_input.vcf.bgz'))
    hail_kin = hl.pc_relate(mt.GT, 0.00, k=2).checkpoint(utils.new_temp_file(extension='ht'))

    r_kin = hl.import_table(resource('pc_relate_r_truth.tsv.bgz'),
                            types={'i': 'struct{s:str}',
                                   'j': 'struct{s:str}',
                                   'kin': 'float',
                                   'ibd0': 'float',
                                   'ibd1': 'float',
                                   'ibd2': 'float'},
                            key=['i', 'j'])

    assert r_kin.select("kin")._same(hail_kin.select("kin"), tolerance=1e-3, absolute=True)
    assert r_kin.select("ibd0")._same(hail_kin.select("ibd0"), tolerance=1.3e-2, absolute=True)
    assert r_kin.select("ibd1")._same(hail_kin.select("ibd1"), tolerance=2.6e-2, absolute=True)
    assert r_kin.select("ibd2")._same(hail_kin.select("ibd2"), tolerance=1.3e-2, absolute=True)

def test_export_gen_exprs(self):
    gen = hl.import_gen(resource('example.gen'),
                        sample_file=resource('example.sample'),
                        contig_recoding={"01": "1"},
                        reference_genome='GRCh37',
                        min_partitions=3).add_col_index().add_row_index()

    out1 = new_temp_file()
    hl.export_gen(gen, out1, id1=hl.str(gen.col_idx),
                  id2=hl.str(gen.col_idx), missing=0.5,
                  varid=hl.str(gen.row_idx), rsid=hl.str(gen.row_idx),
                  gp=[0.0, 1.0, 0.0])

    in1 = (hl.import_gen(out1 + '.gen', sample_file=out1 + '.sample', min_partitions=3)
           .add_col_index()
           .add_row_index())
    self.assertTrue(in1.aggregate_entries(hl.agg.fraction(in1.GP == [0.0, 1.0, 0.0])) == 1.0)
    self.assertTrue(in1.aggregate_rows(hl.agg.fraction((in1.varid == hl.str(in1.row_idx)) &
                                                       (in1.rsid == hl.str(in1.row_idx)))) == 1.0)
    self.assertTrue(in1.aggregate_cols(hl.agg.fraction((in1.s == hl.str(in1.col_idx)))))

def test_dndarray_sum():
    n_variants = 10
    n_samples = 10
    block_size = 3
    n_blocks = 16
    mt1 = hl.balding_nichols_model(n_populations=2,
                                   n_variants=n_variants,
                                   n_samples=n_samples)
    mt1 = mt1.select_entries(dosage=hl.float(mt1.GT.n_alt_alleles()))
    mt2 = hl.balding_nichols_model(n_populations=2,
                                   n_variants=n_variants,
                                   n_samples=n_samples)
    mt2 = mt2.select_entries(dosage=hl.float(mt2.GT.n_alt_alleles()))

    da1 = hl.experimental.dnd.array(mt1, 'dosage', block_size=block_size)
    da2 = hl.experimental.dnd.array(mt2, 'dosage', block_size=block_size)
    da_sum = (da1 + da2).checkpoint(new_temp_file())
    assert da_sum._force_count_blocks() == n_blocks
    da_result = da_sum.collect()

    a1 = np.array(mt1.dosage.collect()).reshape(n_variants, n_samples)
    a2 = np.array(mt2.dosage.collect()).reshape(n_variants, n_samples)
    a_result = a1 + a2

    assert np.array_equal(da_result, a_result)

def test_export(self):
    t = hl.utils.range_table(1).annotate(foo=3)
    tmp_file = new_temp_file()
    t.export(tmp_file)
    with hl.hadoop_open(tmp_file, 'r') as f_in:
        assert f_in.read() == 'idx\tfoo\n0\t3\n'

def compute_ldscore(ht, bm_ld, n, radius, out_name, overwrite):
    r2 = bm_ld ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    # This is required, as the squaring/multiplication densifies, so this re-sparsifies.
    starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False)
    r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False)

    r2_adj = r2_adj.sparsify_triangle()
    r2_adj = checkpoint_tmp(r2_adj)

    # Note that the original ld matrix is triangular
    l2row = checkpoint_tmp(r2_adj.sum(axis=0)).T
    l2col = checkpoint_tmp(r2_adj.sum(axis=1))
    r2_diag = checkpoint_tmp(r2_adj.diagonal()).T
    l2 = l2row + l2col - r2_diag

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_gs_temp_path()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index().rename({'f0': 'ld_score'})
    ht_scores = ht_scores.key_by('idx')

    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx]).drop('idx')
    ht = ht.checkpoint(out_name, overwrite)
    return ht

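# A NumPy sketch (not from the source) of the identity used in
# `compute_ldscore` above: when a symmetric r^2 matrix is stored only as a
# triangle, the full row sums -- the LD scores -- are recovered as
# row sums + column sums - diagonal, since only the diagonal is counted twice.
def example_triangular_row_sums():
    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.uniform(size=(5, 5))
    r2 = (a + a.T) / 2   # symmetric, stands in for the r^2 matrix
    tri = np.triu(r2)    # triangular storage, as after sparsify_triangle()

    l2 = tri.sum(axis=0) + tri.sum(axis=1) - np.diag(tri)
    assert np.allclose(l2, r2.sum(axis=1))
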
def from_entry_expr(cls, entry_expr, block_size=None):
    """Create a block matrix using a matrix table entry expression.

    Examples
    --------
    >>> mt = hl.balding_nichols_model(3, 25, 50)
    >>> bm = BlockMatrix.from_entry_expr(mt.GT.n_alt_alleles())

    Notes
    -----
    If any values are missing, an error is raised.

    Parameters
    ----------
    entry_expr: :class:`.Float64Expression`
        Entry expression for numeric matrix entries.
    block_size: :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.BlockMatrix`
    """
    path = new_temp_file()
    cls.write_from_entry_expr(entry_expr, path, block_size)
    return cls.read(path)

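# The docstring above notes that missing entries are an error. A common
# workaround (the same pattern appears in `test_from_entry_expr` below) is to
# impute a default with `hl.or_else` before building the block matrix; a
# minimal sketch:
def example_from_entry_expr_missing():
    import hail as hl
    from hail.linalg import BlockMatrix

    mt = hl.balding_nichols_model(3, 25, 50)
    bm = BlockMatrix.from_entry_expr(hl.or_else(hl.float64(mt.GT.n_alt_alleles()), 0.0))
    assert bm.shape == (25, 50)
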
def test_export_delim(self):
    t = hl.utils.range_table(1).annotate(foo=3)
    tmp_file = new_temp_file()
    t.export(tmp_file, delimiter=',')
    with hl.hadoop_open(tmp_file, 'r') as f_in:
        assert f_in.read() == 'idx,foo\n0,3\n'

def test_linear_mixed_regression_full_rank(self):
    x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
    p_path = utils.new_temp_file()

    h2_fastlmm = 0.142761
    h2_places = 6
    beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]
    pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

    mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
    model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x],
                                     z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
    model.fit()
    self.assertAlmostEqual(model.h_sq, h2_fastlmm, places=h2_places)

    mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
    mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()))
    ht = hl.linear_mixed_regression_rows(
        (mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev, model)
    assert np.allclose(ht.beta.collect(), beta_fastlmm)
    assert np.allclose(ht.p_value.collect(), pval_hail)

def test_linear_mixed_regression_low_rank(self):
    x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
    p_path = utils.new_temp_file()

    h2_hail = 0.10001626
    beta_hail = [0.0073201542, 0.039969148, -0.036727875, 0.29852363, -0.049212500]
    pval_hail = [0.90685162, 0.54839177, 0.55001054, 9.85247263e-07, 0.42796507]

    mt_chr1 = mt.filter_rows((mt.locus.contig == '1') & (mt.locus.position < 200))
    model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x],
                                     z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
    model.fit()
    self.assertTrue(model.low_rank)
    self.assertAlmostEqual(model.h_sq, h2_hail)

    mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
    mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()))
    ht = hl.linear_mixed_regression_rows(
        (mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev, model)
    assert np.allclose(ht.beta.collect(), beta_hail)
    assert np.allclose(ht.p_value.collect(), pval_hail)

def test_stage_locally(self):
    nd = np.arange(0, 80, dtype=float).reshape(8, 10)
    bm_uri = new_temp_file()
    BlockMatrix.from_numpy(nd, block_size=3).write(bm_uri, stage_locally=True)

    bm = BlockMatrix.read(bm_uri)
    self._assert_eq(nd, bm)

def test_not_identical_headers(self):
    t = new_temp_file('vcf')
    mt = hl.import_vcf(resource('sample.vcf'))
    hl.export_vcf(mt.filter_cols((mt.s != "C1048::HG02024") & (mt.s != "HG00255")), t)

    with self.assertRaisesRegex(FatalError, 'invalid sample IDs'):
        (hl.import_vcf([resource('sample.vcf'), t])
         ._force_count_rows())

def test_memory_issue_from_9009():
    mt = hl.utils.range_matrix_table(1000, 1, n_partitions=1)
    mt = mt.annotate_entries(x=hl.float(mt.row_idx * mt.col_idx))
    mt = mt.annotate_rows(big=hl.zeros(100_000_000))
    try:
        hl.linalg.BlockMatrix.write_from_entry_expr(mt.x, new_temp_file(), overwrite=True)
    except Exception:
        # the write should succeed without loading the large, unreferenced
        # row field `big` into memory
        assert False

def test_rvd_key_write(self):
    tempfile = new_temp_file(suffix='ht')
    ht1 = hl.utils.range_table(1).key_by(foo='a', bar='b')
    ht1.write(tempfile)  # write ensures that table is written with both key fields

    ht1 = hl.read_table(tempfile)
    ht2 = hl.utils.range_table(1).annotate(foo='a')
    assert ht2.annotate(x=ht1.key_by('foo')[ht2.foo])._force_count() == 1

def test_export_plink(self):
    vcf_file = resource('sample.vcf')
    mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

    # permute columns so not in alphabetical order!
    import random
    indices = list(range(mt.count_cols()))
    random.shuffle(indices)
    mt = mt.choose_cols(indices)

    split_vcf_file = uri_path(new_temp_file())
    hl_output = uri_path(new_temp_file())
    plink_output = uri_path(new_temp_file())
    merge_output = uri_path(new_temp_file())

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    run_command(["plink", "--vcf", split_vcf_file,
                 "--make-bed", "--out", plink_output,
                 "--const-fid", "--keep-allele-order"])

    data = []
    with open(uri_path(plink_output + ".bim")) as file:
        for line in file:
            row = line.strip().split()
            row[1] = ":".join([row[0], row[3], row[5], row[4]])
            data.append("\t".join(row) + "\n")

    with open(plink_output + ".bim", 'w') as f:
        f.writelines(data)

    run_command(["plink", "--bfile", plink_output,
                 "--bmerge", hl_output, "--merge-mode",
                 "6", "--out", merge_output])

    same = True
    with open(merge_output + ".diff") as f:
        for line in f:
            row = line.strip().split()
            if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                same = False
                break

    self.assertTrue(same)

def test_read_back_same_as_exported(self):
    t, _ = create_all_values_datasets()
    tmp_file = new_temp_file(prefix="test", suffix=".tsv")
    t.export(tmp_file)
    t_read_back = hl.import_table(tmp_file, types=dict(t.row.dtype)).key_by('idx')
    self.assertTrue(t.select_globals()._same(t_read_back, tolerance=1e-4, absolute=True))

def spread(ht, field, value, key=None) -> Table:
    """Spread a key-value pair of fields across multiple fields.

    :func:`.spread` mimics the functionality of the `spread()` function in R's
    ``tidyr`` package. This is a way to turn "long" format data into "wide"
    format data.

    Given a ``field``, :func:`.spread` will create a new table by grouping
    ``ht`` by its row key and, optionally, any additional fields passed to
    the ``key`` argument.

    After collapsing ``ht`` by these keys, :func:`.spread` creates a new row
    field for each unique value of ``field``, where the row field values are
    given by the corresponding ``value`` in the original ``ht``.

    Parameters
    ----------
    ht : :class:`.Table`
        A Hail table.
    field : :obj:`str`
        The name of the factor field in `ht`.
    value : :obj:`str`
        The name of the value field in `ht`.
    key : :obj:`str` or list of :obj:`str`, optional
        The name of any fields to group by, in addition to the
        row key fields of ``ht``.

    Returns
    -------
    :class:`.Table`
        Table with original ``key`` and ``value`` fields spread across
        multiple columns.
    """
    if key is None:
        key = list(ht.key)
    else:
        key = wrap_to_list(key)
        key = list(ht.key) + key

    field_vals = list(ht.aggregate(hl.agg.collect_as_set(ht[field])))
    ht = (ht.group_by(*key)
          .aggregate(
              **{rv: hl.agg.take(ht[rv], 1)[0]
                 for rv in ht.row_value
                 if rv not in set(key + [field, value])},
              **{fv: hl.agg.filter(ht[field] == fv,
                                   hl.rbind(hl.agg.take(ht[value], 1),
                                            lambda take: hl.cond(hl.len(take) > 0, take[0], 'NA')))
                 for fv in field_vals}))

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)

    # read back so the returned table uses the checkpointed copy, mirroring
    # `gather` above (the original returned the unwritten plan, leaving the
    # write unused)
    return hl.read_table(ht_tmp)

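# A minimal usage sketch for `spread` above, the inverse of `gather`
# (illustrative data; assumes an initialized Hail session with `spread` in
# scope). Values are strings because the implementation fills absent
# (key, factor) combinations with the literal string 'NA'.
def example_spread():
    long = hl.Table.parallelize(
        [{'sample': 'S1', 'field': 'gt_chr1', 'gt': '0/1'},
         {'sample': 'S1', 'field': 'gt_chr2', 'gt': '1/1'},
         {'sample': 'S2', 'field': 'gt_chr1', 'gt': '0/0'},
         {'sample': 'S2', 'field': 'gt_chr2', 'gt': '0/1'}],
        hl.tstruct(sample=hl.tstr, field=hl.tstr, gt=hl.tstr),
        key='sample')

    # One row per sample, with new row fields `gt_chr1` and `gt_chr2`.
    wide = spread(long, 'field', 'gt')
    wide.show()
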
def test_impute_sex_same_as_plink(self):
    import subprocess as sp

    ds = hl.import_vcf(resource('x-chromosome.vcf'))
    sex = hl.impute_sex(ds.GT, include_par=True)

    vcf_file = utils.uri_path(utils.new_temp_file(prefix="plink", suffix="vcf"))
    out_file = utils.uri_path(utils.new_temp_file(prefix="plink"))

    hl.export_vcf(ds, vcf_file)

    try:
        out = sp.check_output(
            ["plink", "--vcf", vcf_file, "--const-fid", "--check-sex",
             "--silent", "--out", out_file],
            stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        print(e.output)
        raise e

    plink_sex = hl.import_table(out_file + '.sexcheck',
                                delimiter=' +',
                                types={'SNPSEX': hl.tint32, 'F': hl.tfloat64})
    plink_sex = plink_sex.select('IID', 'SNPSEX', 'F')
    plink_sex = plink_sex.select(
        s=plink_sex.IID,
        is_female=hl.cond(plink_sex.SNPSEX == 2,
                          True,
                          hl.cond(plink_sex.SNPSEX == 1,
                                  False,
                                  hl.null(hl.tbool))),
        f_stat=plink_sex.F).key_by('s')

    sex = sex.select(s=sex.s,
                     is_female=sex.is_female,
                     f_stat=sex.f_stat)

    self.assertTrue(plink_sex._same(sex.select_globals(), tolerance=1e-3))

    ds = ds.annotate_rows(aaf=(agg.call_stats(ds.GT, ds.alleles)).AF[1])

    self.assertTrue(hl.impute_sex(ds.GT)._same(hl.impute_sex(ds.GT, aaf='aaf')))

def test_write_overwrite(self):
    path = new_temp_file()

    bm = BlockMatrix.from_numpy(np.array([[0]]))
    bm.write(path)
    self.assertRaises(FatalError, lambda: bm.write(path))

    bm2 = BlockMatrix.from_numpy(np.array([[1]]))
    bm2.write(path, overwrite=True)
    self._assert_eq(BlockMatrix.read(path), bm2)

def test_write_from_entry_expr_overwrite(self):
    mt = hl.balding_nichols_model(1, 1, 1)
    mt = mt.select_entries(x=mt.GT.n_alt_alleles())
    bm = BlockMatrix.from_entry_expr(mt.x)

    path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, path)
    self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x, path))

    BlockMatrix.write_from_entry_expr(mt.x, path, overwrite=True)
    self._assert_eq(BlockMatrix.read(path), bm)

    # non-field expressions currently take a separate code path
    path2 = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x + 1, path2)
    self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x + 1, path2))

    BlockMatrix.write_from_entry_expr(mt.x + 2, path2, overwrite=True)
    self._assert_eq(BlockMatrix.read(path2), bm + 2)

def test_export_plink(self):
    vcf_file = resource('sample.vcf')
    mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

    split_vcf_file = uri_path(new_temp_file())
    hl_output = uri_path(new_temp_file())
    plink_output = uri_path(new_temp_file())
    merge_output = uri_path(new_temp_file())

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    run_command(["plink", "--vcf", split_vcf_file,
                 "--make-bed", "--out", plink_output,
                 "--const-fid", "--keep-allele-order"])

    data = []
    with open(uri_path(plink_output + ".bim")) as file:
        for line in file:
            row = line.strip().split()
            row[1] = ":".join([row[0], row[3], row[5], row[4]])
            data.append("\t".join(row) + "\n")

    with open(plink_output + ".bim", 'w') as f:
        f.writelines(data)

    run_command(["plink", "--bfile", plink_output,
                 "--bmerge", hl_output, "--merge-mode",
                 "6", "--out", merge_output])

    same = True
    with open(merge_output + ".diff") as f:
        for line in f:
            row = line.strip().split()
            if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                same = False
                break

    self.assertTrue(same)

def hail_calculation(ds):
    rrm = hl.realized_relationship_matrix(ds['GT'])
    fn = utils.new_temp_file(suffix='.tsv')

    rrm.export_tsv(fn)
    data = []
    with open(utils.uri_path(fn)) as f:
        f.readline()
        for line in f:
            row = line.strip().split()
            data.append(list(map(float, row)))

    return np.array(data)

def test_conversion_equivalence():
    gvcfs = [os.path.join(resource('gvcfs'), '1kg_chr22', path) for path in [
        'HG00187.hg38.g.vcf.gz',
        'HG00190.hg38.g.vcf.gz',
        'HG00308.hg38.g.vcf.gz',
        'HG00313.hg38.g.vcf.gz',
        'HG00320.hg38.g.vcf.gz']]

    tmpdir = new_temp_file()
    mt_path = new_temp_file()
    vds_path = new_temp_file()

    hl.experimental.run_combiner(gvcfs, mt_path, tmpdir,
                                 use_exome_default_intervals=True,
                                 reference_genome='GRCh38',
                                 overwrite=True,
                                 intervals=[hl.eval(hl.parse_locus_interval('chr22', 'GRCh38'))],
                                 key_by_locus_and_alleles=True)

    svcr = hl.read_matrix_table(mt_path)

    vds = hl.vds.VariantDataset.from_merged_representation(svcr).checkpoint(vds_path)
    ref = vds.reference_data
    var = vds.variant_data

    assert svcr.aggregate_entries(hl.agg.count_where(hl.is_defined(svcr.END))) == \
        ref.aggregate_entries(hl.agg.count())
    assert svcr.aggregate_entries(hl.agg.count()) == \
        ref.aggregate_entries(hl.agg.count()) + var.aggregate_entries(hl.agg.count())

    svcr_readback = hl.vds.to_merged_sparse_mt(vds)

    assert svcr._same(svcr_readback)

def test_linear_mixed_model_function(self):
    n, f, m = 4, 2, 3
    y = np.array([0.0, 1.0, 8.0, 9.0])
    x = np.array([[1.0, 0.0],
                  [1.0, 2.0],
                  [1.0, 1.0],
                  [1.0, 4.0]])
    z = np.array([[0.0, 0.0, 1.0],
                  [0.0, 1.0, 2.0],
                  [1.0, 2.0, 0.0],
                  [2.0, 0.0, 1.0]])
    p_path = utils.new_temp_file()

    def make_call(gt):
        if gt == 0.0:
            return hl.Call([0, 0])
        if gt == 1.0:
            return hl.Call([0, 1])
        if gt == 2.0:
            return hl.Call([1, 1])

    data = [{'v': j, 's': i, 'y': y[i], 'x1': x[i, 1], 'zt': make_call(z[i, j])}
            for i in range(n) for j in range(m)]
    ht = hl.Table.parallelize(data, hl.dtype('struct{v: int32, s: int32, y: float64, x1: float64, zt: tcall}'))
    mt = ht.to_matrix_table(row_key=['v'], col_key=['s'], col_fields=['x1', 'y'])
    colsort = np.argsort(mt.key_cols_by().s.collect()).tolist()
    mt = mt.choose_cols(colsort)

    rrm = hl.realized_relationship_matrix(mt.zt).to_numpy()

    # kinship path agrees with from_kinship
    model, p = hl.linear_mixed_model(mt.y, [1, mt.x1], k=rrm, p_path=p_path, overwrite=True)
    model0, p0 = LinearMixedModel.from_kinship(y, x, rrm, p_path, overwrite=True)
    assert model0._same(model)
    assert np.allclose(p0, p)

    # random effects path with standardize=True agrees with low-rank rrm
    s0, u0 = np.linalg.eigh(rrm)
    s0 = np.flip(s0, axis=0)[:m]
    p0 = np.fliplr(u0).T[:m, :]
    model, p = hl.linear_mixed_model(mt.y, [1, mt.x1], z_t=mt.zt.n_alt_alleles(),
                                     p_path=p_path, overwrite=True)
    model0 = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x, p_path=p_path)
    assert model0._same(model)

    # random effects path with standardize=False agrees with from_random_effects
    model0, p0 = LinearMixedModel.from_random_effects(y, x, z, p_path, overwrite=True)
    model, p = hl.linear_mixed_model(mt.y, [1, mt.x1], z_t=mt.zt.n_alt_alleles(),
                                     p_path=p_path, overwrite=True, standardize=False)
    assert model0._same(model)
    assert np.allclose(p0, p.to_numpy())

def separate(ht, field, into, delim) -> Table:
    """Separate a field into multiple fields by splitting on a delimiter
    character or position.

    :func:`.separate` mimics the functionality of the `separate()` function in
    R's ``tidyr`` package.

    This function will create a new table where ``field`` has been split into
    multiple new fields, whose names are given by ``into``.

    If ``delim`` is a ``str`` (including regular expression strings),
    ``field`` will be separated into columns by that string. In this case,
    the length of ``into`` must match the number of resulting fields.

    If ``delim`` is an ``int``, ``field`` will be separated into two row
    fields, where the first field contains the first ``delim`` characters of
    ``field`` and the second field contains the remaining characters.

    Parameters
    ----------
    ht : :class:`.Table`
        A Hail table.
    field : :obj:`str`
        The name of the field to separate in ``ht``.
    into : list of :obj:`str`
        The names of the fields to create by separating ``field``.
    delim : :obj:`str` or :obj:`int`
        The character or position by which to separate ``field``.

    Returns
    -------
    :class:`.Table`
        Table with original ``field`` split into fields whose names are
        defined by `into`.
    """
    if isinstance(delim, int):
        ht = ht.annotate(**{into[0]: ht[field][:delim],
                            into[1]: ht[field][delim:]})
    else:
        split = ht[field].split(delim)
        ht = ht.annotate(**{into[i]: split[i] for i in range(len(into))})
    ht = ht.drop(field)

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)

    # read back so the returned table uses the checkpointed copy, mirroring
    # `gather` above (the original returned the unwritten plan, leaving the
    # write unused)
    return hl.read_table(ht_tmp)

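# A minimal usage sketch for `separate` above, showing both the delimiter and
# the fixed-position forms (illustrative data; assumes an initialized Hail
# session with `separate` in scope).
def example_separate():
    ht = hl.Table.parallelize(
        [{'variant': '1:12345'}, {'variant': '2:67890'}],
        hl.tstruct(variant=hl.tstr))

    # Split on ':' into two new fields; `variant` itself is dropped.
    by_delim = separate(ht, 'variant', ['contig', 'position'], ':')
    by_delim.show()

    # An int delimiter splits at a character position instead.
    by_pos = separate(ht, 'variant', ['prefix', 'rest'], 1)
    by_pos.show()
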
def from_matrix_table(cls, entry_expr, path=None, block_size=None):
    if not path:
        path = new_temp_file(suffix="bm")
    if not block_size:
        block_size = cls.default_block_size()

    source = entry_expr._indices.source
    if not isinstance(source, MatrixTable):
        raise ValueError("Expect an expression of 'MatrixTable', found {}".format(
            "expression of '{}'".format(source.__class__) if source is not None else 'scalar expression'))
    mt = source
    base, _ = mt._process_joins(entry_expr)

    analyze('block_matrix_from_expr', entry_expr, mt._entry_indices)

    mt._jvds.writeBlockMatrix(path, to_expr(entry_expr)._ast.to_hql(), block_size)
    return cls.read(path)

def test_from_entry_expr(self):
    mt = get_dataset()
    mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

    a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
    a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
    a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

    self._assert_eq(a1, a2)
    self._assert_eq(a1, a3)

    path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
    a4 = BlockMatrix.read(path).to_numpy()
    self._assert_eq(a1, a4)

def test_linear_mixed_regression_pass_through(self):
    x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
    p_path = utils.new_temp_file()

    mt_chr1 = mt.filter_rows((mt.locus.contig == '1') & (mt.locus.position < 200))
    model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x],
                                     z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
    model.fit(log_gamma=0)

    mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
    mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()),
                                    foo=hl.struct(bar=hl.rand_norm(0, 1)))
    ht = hl.linear_mixed_regression_rows(
        (mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
        model, pass_through=['stats', mt_chr3.foo.bar, mt_chr3.cm_position])

    assert mt_chr3.aggregate_rows(hl.agg.all(mt_chr3.foo.bar == ht[mt_chr3.row_key].bar))

def test_import_locus_intervals(self):
    interval_file = resource('annotinterall.interval_list')
    t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
    nint = t.count()

    i = 0
    with open(interval_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                i += 1

    self.assertEqual(nint, i)
    self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

    tmp_file = new_temp_file(prefix="test", suffix="interval_list")
    start = t.interval.start
    end = t.interval.end
    (t
     .key_by(interval=hl.locus_interval(start.contig, start.position, end.position, True, True))
     .select()
     .export(tmp_file, header=False))

    t2 = hl.import_locus_intervals(tmp_file)

    self.assertTrue(t.select()._same(t2))

def test_write_stage_locally(self):
    t = hl.utils.range_table(5)
    f = new_temp_file(suffix='ht')
    t.write(f, stage_locally=True)

    t2 = hl.read_table(f)
    self.assertTrue(t._same(t2))

def test_export_plink_exprs(self):
    ds = get_dataset()
    fam_mapping = {'f0': 'fam_id', 'f1': 'ind_id', 'f2': 'pat_id', 'f3': 'mat_id',
                   'f4': 'is_female', 'f5': 'pheno'}
    bim_mapping = {'f0': 'contig', 'f1': 'varid', 'f2': 'cm_position',
                   'f3': 'position', 'f4': 'a1', 'f5': 'a2'}

    # Test default arguments
    out1 = new_temp_file()
    hl.export_plink(ds, out1)
    fam1 = (hl.import_table(out1 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))
    bim1 = (hl.import_table(out1 + '.bim', no_header=True, impute=False)
            .rename(bim_mapping))

    self.assertTrue(fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0") &
                             (fam1.mat_id == "0") & (fam1.is_female == "0") &
                             (fam1.pheno == "NA")))
    self.assertTrue(bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1) &
                             (bim1.cm_position == "0.0")))

    # Test non-default FAM arguments
    out2 = new_temp_file()
    hl.export_plink(ds, out2, ind_id=ds.s, fam_id=ds.s, pat_id="nope",
                    mat_id="nada", is_female=True, pheno=False)
    fam2 = (hl.import_table(out2 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))

    self.assertTrue(fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope") &
                             (fam2.mat_id == "nada") & (fam2.is_female == "2") &
                             (fam2.pheno == "1")))

    # Test quantitative phenotype
    out3 = new_temp_file()
    hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
    fam3 = (hl.import_table(out3 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))

    self.assertTrue(fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0") &
                             (fam3.mat_id == "0") & (fam3.is_female == "0") &
                             (fam3.pheno != "0") & (fam3.pheno != "NA")))

    # Test non-default BIM arguments
    out4 = new_temp_file()
    hl.export_plink(ds, out4, varid="hello", cm_position=100)
    bim4 = (hl.import_table(out4 + '.bim', no_header=True, impute=False)
            .rename(bim_mapping))

    self.assertTrue(bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

    # Test call expr
    out5 = new_temp_file()
    ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
    hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
    ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
    nerrors = ds_all_hom_ref.aggregate_entries(hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
    self.assertTrue(nerrors == 0)

    # Test white-space in FAM id expr raises error
    with self.assertRaisesRegex(TypeError, "has spaces in the following values:"):
        hl.export_plink(ds, new_temp_file(), mat_id="hello world")

    # Test white-space in varid expr raises error
    with self.assertRaisesRegex(FatalError, "no white space allowed:"):
        hl.export_plink(ds, new_temp_file(), varid="hello world")

def test_ld_score_regression(self):
    ht_scores = hl.import_table(
        doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
        key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

    ht_50_irnt = hl.import_table(
        doctest_resource('ld_score_regression.50_irnt.sumstats.tsv'),
        key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    ht_50_irnt = ht_50_irnt.annotate(
        chi_squared=ht_50_irnt['Z']**2,
        n=ht_50_irnt['N'],
        ld_score=ht_scores[ht_50_irnt['SNP']]['L2'],
        locus=hl.locus(ht_scores[ht_50_irnt['SNP']]['CHR'],
                       ht_scores[ht_50_irnt['SNP']]['BP']),
        alleles=hl.array([ht_50_irnt['A2'], ht_50_irnt['A1']]),
        phenotype='50_irnt')

    ht_50_irnt = ht_50_irnt.key_by(ht_50_irnt['locus'], ht_50_irnt['alleles'])
    ht_50_irnt = ht_50_irnt.select(ht_50_irnt['chi_squared'],
                                   ht_50_irnt['n'],
                                   ht_50_irnt['ld_score'],
                                   ht_50_irnt['phenotype'])

    ht_20160 = hl.import_table(
        doctest_resource('ld_score_regression.20160.sumstats.tsv'),
        key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    ht_20160 = ht_20160.annotate(
        chi_squared=ht_20160['Z']**2,
        n=ht_20160['N'],
        ld_score=ht_scores[ht_20160['SNP']]['L2'],
        locus=hl.locus(ht_scores[ht_20160['SNP']]['CHR'],
                       ht_scores[ht_20160['SNP']]['BP']),
        alleles=hl.array([ht_20160['A2'], ht_20160['A1']]),
        phenotype='20160')

    ht_20160 = ht_20160.key_by(ht_20160['locus'], ht_20160['alleles'])
    ht_20160 = ht_20160.select(ht_20160['chi_squared'],
                               ht_20160['n'],
                               ht_20160['ld_score'],
                               ht_20160['phenotype'])

    ht = ht_50_irnt.union(ht_20160)
    mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                            col_key=['phenotype'],
                            row_fields=['ld_score'],
                            col_fields=[])

    mt_tmp = new_temp_file()
    mt.write(mt_tmp, overwrite=True)
    mt = hl.read_matrix_table(mt_tmp)

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=mt['ld_score'],
        ld_score_expr=mt['ld_score'],
        chi_sq_exprs=mt['chi_squared'],
        n_samples_exprs=mt['n'],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = {
        x['phenotype']: {
            'mean_chi_sq': x['mean_chi_sq'],
            'intercept_estimate': x['intercept']['estimate'],
            'intercept_standard_error': x['intercept']['standard_error'],
            'snp_heritability_estimate': x['snp_heritability']['estimate'],
            'snp_heritability_standard_error':
                x['snp_heritability']['standard_error']}
        for x in ht_results.collect()}

    self.assertAlmostEqual(results['50_irnt']['mean_chi_sq'], 3.4386, places=4)
    self.assertAlmostEqual(results['50_irnt']['intercept_estimate'], 0.7727, places=4)
    self.assertAlmostEqual(results['50_irnt']['intercept_standard_error'], 0.2461, places=4)
    self.assertAlmostEqual(results['50_irnt']['snp_heritability_estimate'], 0.3845, places=4)
    self.assertAlmostEqual(results['50_irnt']['snp_heritability_standard_error'], 0.1067, places=4)

    self.assertAlmostEqual(results['20160']['mean_chi_sq'], 1.5209, places=4)
    self.assertAlmostEqual(results['20160']['intercept_estimate'], 1.2109, places=4)
    self.assertAlmostEqual(results['20160']['intercept_standard_error'], 0.2238, places=4)
    self.assertAlmostEqual(results['20160']['snp_heritability_estimate'], 0.0486, places=4)
    self.assertAlmostEqual(results['20160']['snp_heritability_standard_error'], 0.0416, places=4)

    ht = ht_50_irnt.annotate(
        chi_squared_50_irnt=ht_50_irnt['chi_squared'],
        n_50_irnt=ht_50_irnt['n'],
        chi_squared_20160=ht_20160[ht_50_irnt.key]['chi_squared'],
        n_20160=ht_20160[ht_50_irnt.key]['n'])

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=ht['ld_score'],
        ld_score_expr=ht['ld_score'],
        chi_sq_exprs=[ht['chi_squared_50_irnt'],
                      ht['chi_squared_20160']],
        n_samples_exprs=[ht['n_50_irnt'],
                         ht['n_20160']],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = {
        x['phenotype']: {
            'mean_chi_sq': x['mean_chi_sq'],
            'intercept_estimate': x['intercept']['estimate'],
            'intercept_standard_error': x['intercept']['standard_error'],
            'snp_heritability_estimate': x['snp_heritability']['estimate'],
            'snp_heritability_standard_error':
                x['snp_heritability']['standard_error']}
        for x in ht_results.collect()}

    self.assertAlmostEqual(results[0]['mean_chi_sq'], 3.4386, places=4)
    self.assertAlmostEqual(results[0]['intercept_estimate'], 0.7727, places=4)
    self.assertAlmostEqual(results[0]['intercept_standard_error'], 0.2461, places=4)
    self.assertAlmostEqual(results[0]['snp_heritability_estimate'], 0.3845, places=4)
    self.assertAlmostEqual(results[0]['snp_heritability_standard_error'], 0.1067, places=4)

    self.assertAlmostEqual(results[1]['mean_chi_sq'], 1.5209, places=4)
    self.assertAlmostEqual(results[1]['intercept_estimate'], 1.2109, places=4)
    self.assertAlmostEqual(results[1]['intercept_standard_error'], 0.2238, places=4)
    self.assertAlmostEqual(results[1]['snp_heritability_estimate'], 0.0486, places=4)
    self.assertAlmostEqual(results[1]['snp_heritability_standard_error'], 0.0416, places=4)

def test_linear_mixed_model_fastlmm(self):
    # FastLMM Test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
    #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
    #
    # Data is filtered to chromosome 1,3 and samples 0-124,375-499 (2000 variants and 250 samples)
    #
    # Results are computed with single_snp (with LOCO) as in:
    #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

    n, m = 250, 1000  # per chromosome

    x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

    x = np.array([np.ones(n), mt.key_cols_by()['x'].collect()]).T
    y = np.array(mt.key_cols_by()['y'].collect())

    mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
    mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

    # testing chrom 1 for h2, betas, p-values
    h2_fastlmm = 0.14276125
    beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

    # FastLMM p-values do not agree to high precision because FastLMM regresses
    # out x from each SNP first and does an F(1, dof)-test on (beta / se)^2
    # (t-test), whereas Hail does likelihood ratio test.
    # We verify below that Hail's p-values remain fixed going forward.
    # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
    pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

    gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)

    g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
    g_std = self._filter_and_standardize_cols(g)

    # full rank
    k = (g_std @ g_std.T) * (n / m)
    s, u = np.linalg.eigh(k)
    p = u.T
    model = LinearMixedModel(p @ y, p @ x, s)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)

    h2_std_error = 0.13770773  # hard coded having checked against plot
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
    argmax = int(100 * h2_fastlmm)
    assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
    assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

    mt3_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
    a = BlockMatrix.from_entry_expr(mt3_chr3_5var.GT.n_alt_alleles()).to_numpy().T

    # FastLMM standardizes each variant to have mean 0 and variance 1.
    a = self._filter_and_standardize_cols(a) * np.sqrt(n)
    pa = p @ a

    model.fit(log_gamma=np.log(gamma_fastlmm))

    res = model.fit_alternatives_numpy(pa, return_pandas=True)

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    pa_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

    res = model.fit_alternatives(pa_t_path).to_pandas()

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    # low rank
    ld = g_std.T @ g_std
    sl, v = np.linalg.eigh(ld)
    n_eigenvectors = int(np.sum(sl > 1e-10))
    assert n_eigenvectors < n
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n / m)
    p = (g_std @ (v / np.sqrt(sl))).T
    model = LinearMixedModel(p @ y, p @ x, s, y, x)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    model.fit(log_gamma=np.log(gamma_fastlmm))

    pa = p @ a
    res = model.fit_alternatives_numpy(pa, a, return_pandas=True)

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    a_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

    pa_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

    res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    # testing chrom 3 for h2
    h2_fastlmm = 0.36733240

    g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
    g_std = self._filter_and_standardize_cols(g)

    # full rank
    k = (g_std @ g_std.T) * (n / m)
    s, u = np.linalg.eigh(k)
    p = u.T
    model = LinearMixedModel(p @ y, p @ x, s)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)

    h2_std_error = 0.17409641  # hard coded having checked against plot
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
    argmax = int(100 * h2_fastlmm)
    assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
    assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

    # low rank
    l = g_std.T @ g_std
    sl, v = np.linalg.eigh(l)
    n_eigenvectors = int(np.sum(sl > 1e-10))
    assert n_eigenvectors < n
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n / m)
    p = (g_std @ (v / np.sqrt(sl))).T
    model = LinearMixedModel(p @ y, p @ x, s, y, x)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

def test_linear_mixed_model_math(self):
    gamma = 2.0  # testing at fixed value of gamma
    n, f, m = 4, 2, 3
    y = np.array([0.0, 1.0, 8.0, 9.0])
    x = np.array([[1.0, 0.0],
                  [1.0, 2.0],
                  [1.0, 1.0],
                  [1.0, 4.0]])
    z = np.array([[0.0, 0.0, 1.0],
                  [0.0, 1.0, 2.0],
                  [1.0, 2.0, 4.0],
                  [2.0, 4.0, 8.0]])
    k = z @ z.T
    v = k + np.eye(4) / gamma
    v_inv = np.linalg.inv(v)

    beta = np.linalg.solve(x.T @ v_inv @ x, x.T @ v_inv @ y)
    residual = y - x @ beta
    sigma_sq = 1 / (n - f) * (residual @ v_inv @ residual)
    sv = sigma_sq * v
    neg_log_lkhd = 0.5 * (np.linalg.slogdet(sv)[1] + np.linalg.slogdet(x.T @ np.linalg.inv(sv) @ x)[1])  # plus C

    x_star = np.array([1.0, 0.0, 1.0, 0.0])
    a = x_star.reshape(n, 1)
    x1 = np.hstack([a, x])
    beta1 = np.linalg.solve(x1.T @ v_inv @ x1, x1.T @ v_inv @ y)
    residual1 = y - x1 @ beta1
    chi_sq = n * np.log((residual @ v_inv @ residual) / (residual1 @ v_inv @ residual1))

    # test from_kinship, full-rank fit
    model, p = LinearMixedModel.from_kinship(y, x, k)
    s0, u0 = np.linalg.eigh(k)
    s0 = np.flip(s0, axis=0)
    p0 = np.fliplr(u0).T
    self.assertTrue(model._same(LinearMixedModel(p0 @ y, p0 @ x, s0)))

    model.fit(np.log(gamma))
    self.assertTrue(np.allclose(model.beta, beta))
    self.assertAlmostEqual(model.sigma_sq, sigma_sq)
    self.assertAlmostEqual(model.compute_neg_log_reml(np.log(gamma)), neg_log_lkhd)

    # test full-rank alternative
    pa = p @ a
    stats = model.fit_alternatives_numpy(pa).collect()[0]
    self.assertAlmostEqual(stats.beta, beta1[0])
    self.assertAlmostEqual(stats.chi_sq, chi_sq)

    pa_t_path = utils.new_temp_file()
    BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)
    stats = model.fit_alternatives(pa_t_path).collect()[0]
    self.assertAlmostEqual(stats.beta, beta1[0])
    self.assertAlmostEqual(stats.chi_sq, chi_sq)

    # test from_random_effects, low-rank fit
    s0, p0 = s0[:m], p0[:m, :]
    # test BlockMatrix path
    temp_path = utils.new_temp_file()
    model, _ = LinearMixedModel.from_random_effects(y, x, BlockMatrix.from_numpy(z),
                                                    p_path=temp_path, complexity_bound=0)
    lmm = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x, p_path=temp_path)
    self.assertTrue(model._same(lmm))
    # test ndarray path
    model, p = LinearMixedModel.from_random_effects(y, x, z)
    lmm = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x)
    self.assertTrue(model._same(lmm))

    model.fit(np.log(gamma))
    self.assertTrue(np.allclose(model.beta, beta))
    self.assertAlmostEqual(model.sigma_sq, sigma_sq)
    self.assertAlmostEqual(model.compute_neg_log_reml(np.log(gamma)), neg_log_lkhd)

    # test low_rank alternative
    pa = p @ a
    stats = model.fit_alternatives_numpy(pa, a).collect()[0]
    self.assertAlmostEqual(stats.beta, beta1[0])
    self.assertAlmostEqual(stats.chi_sq, chi_sq)

    a_t_path = utils.new_temp_file()
    BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

    pa_t_path = utils.new_temp_file()
    BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

    stats = model.fit_alternatives(pa_t_path, a_t_path).collect()[0]
    self.assertAlmostEqual(stats.beta, beta1[0])
    self.assertAlmostEqual(stats.chi_sq, chi_sq)

def value_irs(self):
    b = ir.TrueIR()
    c = ir.Ref('c')
    i = ir.I32(5)
    j = ir.I32(7)
    st = ir.Str('Hail')
    a = ir.Ref('a')
    aa = ir.Ref('aa')
    da = ir.Ref('da')
    nd = ir.Ref('nd')
    v = ir.Ref('v')
    s = ir.Ref('s')
    t = ir.Ref('t')
    call = ir.Ref('call')

    table = ir.TableRange(5, 3)

    matrix_read = ir.MatrixRead(
        ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
        False, False)

    block_matrix_read = ir.BlockMatrixRead(ir.BlockMatrixNativeReader('fake_file_path'))

    value_irs = [
        i, ir.I64(5), ir.F32(3.14), ir.F64(3.14), s, ir.TrueIR(), ir.FalseIR(), ir.Void(),
        ir.Cast(i, hl.tfloat64),
        ir.NA(hl.tint32),
        ir.IsNA(i),
        ir.If(b, i, j),
        ir.Let('v', i, v),
        ir.Ref('x'),
        ir.ApplyBinaryPrimOp('+', i, j),
        ir.ApplyUnaryPrimOp('-', i),
        ir.ApplyComparisonOp('EQ', i, j),
        ir.MakeArray([i, ir.NA(hl.tint32), ir.I32(-3)], hl.tarray(hl.tint32)),
        ir.ArrayRef(a, i),
        ir.ArrayLen(a),
        ir.ArrayRange(ir.I32(0), ir.I32(5), ir.I32(1)),
        ir.ArraySort(a, 'l', 'r', ir.ApplyComparisonOp("LT", ir.Ref('l'), ir.Ref('r'))),
        ir.ToSet(a),
        ir.ToDict(da),
        ir.ToArray(a),
        ir.MakeNDArray(2,
                       ir.MakeArray([ir.F64(-1.0), ir.F64(1.0)], hl.tarray(hl.tfloat64)),
                       ir.MakeArray([ir.I64(1), ir.I64(2)], hl.tarray(hl.tint64)),
                       ir.TrueIR()),
        ir.NDArrayRef(nd, [ir.I64(1), ir.I64(2)]),
        ir.LowerBoundOnOrderedCollection(a, i, True),
        ir.GroupByKey(da),
        ir.ArrayMap(a, 'v', v),
        ir.ArrayFilter(a, 'v', v),
        ir.ArrayFlatMap(aa, 'v', v),
        ir.ArrayFold(a, ir.I32(0), 'x', 'v', v),
        ir.ArrayScan(a, ir.I32(0), 'x', 'v', v),
        ir.ArrayLeftJoinDistinct(a, a, 'l', 'r', ir.I32(0), ir.I32(1)),
        ir.ArrayFor(a, 'v', ir.Void()),
        ir.AggFilter(ir.TrueIR(), ir.I32(0), False),
        ir.AggExplode(ir.ArrayRange(ir.I32(0), ir.I32(2), ir.I32(1)), 'x', ir.I32(0), False),
        ir.AggGroupBy(ir.TrueIR(), ir.I32(0), False),
        ir.AggArrayPerElement(ir.ArrayRange(ir.I32(0), ir.I32(2), ir.I32(1)), 'x', 'y', ir.I32(0), False),
        ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]),
        ir.ApplyScanOp('Collect', [], None, [ir.I32(0)]),
        ir.ApplyAggOp('Histogram', [ir.F64(-5.0), ir.F64(5.0), ir.I32(100)], None, [ir.F64(-2.11)]),
        ir.ApplyAggOp('CallStats', [], [ir.I32(2)], [call]),
        ir.ApplyAggOp('TakeBy', [ir.I32(10)], None, [ir.F64(-2.11), ir.F64(-2.11)]),
        ir.Begin([ir.Void()]),
        ir.MakeStruct([('x', i)]),
        ir.SelectFields(s, ['x', 'z']),
        ir.InsertFields(s, [('x', i)], None),
        ir.GetField(s, 'x'),
        ir.MakeTuple([i, b]),
        ir.GetTupleElement(t, 1),
        ir.In(2, hl.tfloat64),
        ir.Die(ir.Str('mumblefoo'), hl.tfloat64),
        ir.Apply('&&', b, c),
        ir.Apply('toFloat64', i),
        ir.Uniroot('x', ir.F64(3.14), ir.F64(-5.0), ir.F64(5.0)),
        ir.Literal(hl.tarray(hl.tint32), [1, 2, None]),
        ir.TableCount(table),
        ir.TableGetGlobals(table),
        ir.TableCollect(table),
        ir.TableToValueApply(table, {'name': 'ForceCountTable'}),
        ir.MatrixToValueApply(matrix_read, {'name': 'ForceCountMatrixTable'}),
        ir.TableAggregate(table, ir.MakeStruct([('foo', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])),
        ir.TableWrite(table, ir.TableNativeWriter(new_temp_file(), False, True, "fake_codec_spec$$")),
        ir.TableWrite(table, ir.TableTextWriter(new_temp_file(), None, True, 0, ",")),
        ir.MatrixAggregate(matrix_read, ir.MakeStruct([('foo', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])),
        ir.MatrixWrite(matrix_read, ir.MatrixNativeWriter(new_temp_file(), False, False, "")),
        ir.MatrixWrite(matrix_read, ir.MatrixVCFWriter(new_temp_file(), None, False, None)),
        ir.MatrixWrite(matrix_read, ir.MatrixGENWriter(new_temp_file(), 4)),
        ir.MatrixWrite(matrix_read, ir.MatrixPLINKWriter(new_temp_file())),
        ir.MatrixMultiWrite([matrix_read, matrix_read],
                            ir.MatrixNativeMultiWriter(new_temp_file(), False, False)),
        ir.BlockMatrixWrite(block_matrix_read,
                            ir.BlockMatrixNativeWriter('fake_file_path', False, False, False))
    ]

    return value_irs

def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table:
    """Return a table containing the vertices in a near `maximal independent
    set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_ of an
    undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...     pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...         s=related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----
    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those in
    the maximal independent set, or those in the complement of this set. This
    is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always be
    perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field; this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.
    """
    if i.dtype != j.dtype:
        raise ValueError("'maximal_independent_set' expects arguments `i` and `j` to have same type. "
                         "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError("'maximal_independent_set' expects an expression of 'Table'. Found {}".format(
            "expression of '{}'".format(source.__class__) if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_variable('l', wrapped_node_t)
        r = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

    mis_nodes = construct_expr(
        JavaIR(Env.hail().utils.Graph.pyMaximalIndependentSet(
            Env.spark_backend('maximal_independent_set')._to_java_ir(edges.collect(_localize=False)._ir),
            node_t._parsable_string(),
            joption(tie_breaker_str))),
        hl.tset(node_t))

    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    nodes = nodes.annotate_globals(mis_nodes=mis_nodes)
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    nodes = nodes.select_globals()
    if keyed:
        return nodes.key_by('node')
    return nodes

def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])

    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                   ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to apply
       to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the
       expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are
       matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants (rows) by
       phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields ``locus`` of
       type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field of type
       :py:data:`.tstr` that uniquely identifies phenotypes represented in
       the matrix table. The column key must be a single expression;
       compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field (not a list
       of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field (not a
       list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes above for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of the
          standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with
    missing values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
        Row-indexed expression for the LD scores used to derive variant
        weights in the model.
    ld_score_expr : :class:`.Float64Expression`
        Row-indexed expression for the LD scores used as covariates in the
        model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        One or more row-indexed (if table) or entry-indexed (if matrix table)
        expressions for chi-squared statistics resulting from genome-wide
        association studies.
    n_samples_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression`
        One or more row-indexed (if table) or entry-indexed (if matrix table)
        expressions indicating the number of samples used in the studies that
        generated the test statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
        The number of blocks used in the jackknife approach to estimating
        standard errors.
    two_step_threshold : :obj:`int`
        Variants with chi-squared statistics greater than this value are
        excluded in the first step of the two-step procedure used to fit the
        model.
    n_reference_panel_variants : :obj:`int`, optional
        Number of variants used to estimate the SNP-heritability
        :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function.
    """

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices)
    analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr, 1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr, 1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i] for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i] for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial, 1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial', '__w_initial_floor',
                                              '__x', '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                                 mt.__step1_betas[1] *
                                                 mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 / n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] +
                                                 mt.__step2_betas[1] *
                                                 mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n) / M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0 / (mt.__w_initial_floor *
                                 2.0 * (mt.__initial_betas[0] +
                                        mt.__initial_betas[1] *
                                        mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M / hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
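
# A minimal numpy sketch (illustrative only; `_block_jackknife_sketch` is a
# hypothetical helper, not part of ld_score_regression) of the
# delete-one-block jackknife used above for standard errors. Given the
# estimate on all blocks and the n_blocks leave-one-block-out estimates,
# form the bias-corrected pseudovalues
#     n_blocks * beta_full - (n_blocks - 1) * beta_block
# and compute the variance of their mean with the same
# (sum(x**2) - sum(x)**2 / n) / (n - 1) / n expression used above.
def _block_jackknife_sketch(beta_full, block_betas):
    import numpy as np

    block_betas = np.asarray(block_betas, dtype=float)
    n_blocks = len(block_betas)
    pseudovalues = n_blocks * beta_full - (n_blocks - 1) * block_betas
    mean = pseudovalues.mean()
    variance = ((np.sum(pseudovalues ** 2) -
                 np.sum(pseudovalues) ** 2 / n_blocks) /
                (n_blocks - 1) / n_blocks)
    return mean, variance  # e.g. _block_jackknife_sketch(0.5, [0.48, 0.51, 0.49, 0.52])
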
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+

    Warning
    -------
    :func:`.ld_score` will fail if ``entry_expr`` results in any missing
    values. The special float value ``nan`` is not considered a missing
    value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----
    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr : :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus position.
    annotation_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. The univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation.
    """

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
            (if specified), and annotation_exprs (if specified) must
            come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr, radius, coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x), value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row
                     if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
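
# A dense numpy sketch (illustrative only; `_ld_score_sketch` is a
# hypothetical helper, not part of ld_score) of the computation above for a
# genotype matrix `g` of shape (variants, samples) with no constant rows:
# square the variant correlation matrix, apply the same bias adjustment
# ((n - 1) / (n - 2)) * r2 - 1 / (n - 2), zero entries outside the
# +/- radius window, and take row sums.
def _ld_score_sketch(g, positions, radius):
    import numpy as np

    g = np.asarray(g, dtype=float)
    n = g.shape[1]  # number of samples
    r2 = np.corrcoef(g) ** 2  # variant-by-variant squared correlation
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - 1.0 / (n - 2.0)
    positions = np.asarray(positions, dtype=float)
    in_window = np.abs(positions[:, None] - positions[None, :]) <= radius
    return (r2_adj * in_window).sum(axis=1)  # univariate LD score per variant
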
def value_irs(self):
    b = ir.TrueIR()
    c = ir.Ref('c')
    i = ir.I32(5)
    j = ir.I32(7)
    st = ir.Str('Hail')
    a = ir.Ref('a')
    aa = ir.Ref('aa')
    da = ir.Ref('da')
    v = ir.Ref('v')
    s = ir.Ref('s')
    t = ir.Ref('t')
    call = ir.Ref('call')

    table = ir.TableRange(5, 3)

    collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])

    call_stats_sig = ir.AggSignature('CallStats', [], [hl.tint32], [hl.tcall])

    hist_sig = ir.AggSignature(
        'Histogram', [hl.tfloat64, hl.tfloat64, hl.tint32], None, [hl.tfloat64])

    take_by_sig = ir.AggSignature('TakeBy', [hl.tint32], None, [hl.tfloat64, hl.tfloat64])

    table = ir.TableRange(10, 4)

    value_irs = [
        i, ir.I64(5), ir.F32(3.14), ir.F64(3.14), s, ir.TrueIR(), ir.FalseIR(), ir.Void(),
        ir.Cast(i, hl.tfloat64),
        ir.NA(hl.tint32),
        ir.IsNA(i),
        ir.If(b, i, j),
        ir.Let('v', i, v),
        ir.Ref('x'),
        ir.ApplyBinaryOp('+', i, j),
        ir.ApplyUnaryOp('-', i),
        ir.ApplyComparisonOp('EQ', i, j),
        ir.MakeArray([i, ir.NA(hl.tint32), ir.I32(-3)], hl.tarray(hl.tint32)),
        ir.ArrayRef(a, i),
        ir.ArrayLen(a),
        ir.ArrayRange(ir.I32(0), ir.I32(5), ir.I32(1)),
        ir.ArraySort(a, b, False),
        ir.ToSet(a),
        ir.ToDict(da),
        ir.ToArray(a),
        ir.LowerBoundOnOrderedCollection(a, i, True),
        ir.GroupByKey(da),
        ir.ArrayMap(a, 'v', v),
        ir.ArrayFilter(a, 'v', v),
        ir.ArrayFlatMap(aa, 'v', v),
        ir.ArrayFold(a, ir.I32(0), 'x', 'v', v),
        ir.ArrayScan(a, ir.I32(0), 'x', 'v', v),
        ir.ArrayFor(a, 'v', ir.Void()),
        ir.AggFilter(ir.TrueIR(), ir.I32(0)),
        ir.AggExplode(ir.ArrayRange(ir.I32(0), ir.I32(2), ir.I32(1)), 'x', ir.I32(0)),
        ir.AggGroupBy(ir.TrueIR(), ir.I32(0)),
        ir.ApplyAggOp([], None, [ir.I32(0)], collect_sig),
        ir.ApplyScanOp([], None, [ir.I32(0)], collect_sig),
        ir.ApplyAggOp([ir.F64(-5.0), ir.F64(5.0), ir.I32(100)], None, [ir.F64(-2.11)], hist_sig),
        ir.ApplyAggOp([], [ir.I32(2)], [call], call_stats_sig),
        ir.ApplyAggOp([ir.I32(10)], None, [ir.F64(-2.11), ir.F64(-2.11)], take_by_sig),
        ir.InitOp(ir.I32(0), [ir.I32(2)], call_stats_sig),
        ir.SeqOp(ir.I32(0), [i], collect_sig),
        ir.SeqOp(ir.I32(0), [ir.F64(-2.11), ir.I32(17)], take_by_sig),
        ir.Begin([ir.Void()]),
        ir.MakeStruct([('x', i)]),
        ir.SelectFields(s, ['x', 'z']),
        ir.InsertFields(s, [('x', i)]),
        ir.GetField(s, 'x'),
        ir.MakeTuple([i, b]),
        ir.GetTupleElement(t, 1),
        ir.StringSlice(st, ir.I32(1), ir.I32(2)),
        ir.StringLength(st),
        ir.In(2, hl.tfloat64),
        ir.Die('mumblefoo', hl.tfloat64),
        ir.Apply('&&', b, c),
        ir.Apply('toFloat64', i),
        ir.Uniroot('x', ir.F64(3.14), ir.F64(-5.0), ir.F64(5.0)),
        ir.Literal(hl.tarray(hl.tint32), [1, 2, None]),
        ir.TableCount(table),
        ir.TableAggregate(table, ir.MakeStruct([('foo', ir.ApplyAggOp([], None, [ir.I32(0)], collect_sig))])),
        ir.TableWrite(table, new_temp_file(), False, True, "fake_codec_spec$$"),
    ]
    return value_irs