def generate_cross_pop_ld_scores_from_ld_matrices(
        pop1, pop2, data_type, pop_data, min_frequency=0.01,
        call_rate_cutoff=0.8, adj: bool = False, radius: int = 1000000,
        overwrite=False, temp_bucket='gs://gnomad-tmp/ld'):
    n1 = pop_data.pop[pop1]
    n2 = pop_data.pop[pop2]

    ht1 = hl.read_table(ld_resources._ld_index_path(data_type, pop1, adj=adj))
    ht1 = ht1.filter((ht1.pop_freq.AF >= min_frequency) &
                     (ht1.pop_freq.AF <= 1 - min_frequency) &
                     (ht1.pop_freq.AN / n1 >= 2 * call_rate_cutoff))

    ht2 = hl.read_table(ld_resources._ld_index_path(data_type, pop2, adj=adj))
    ht2 = ht2.filter((ht2.pop_freq.AF >= min_frequency) &
                     (ht2.pop_freq.AF <= 1 - min_frequency) &
                     (ht2.pop_freq.AN / n2 >= 2 * call_rate_cutoff))

    # restrict both indices to the variants the two populations share
    ht1 = ht1.filter(hl.is_defined(ht2[ht1.key])).add_index(name='new_idx').checkpoint(
        f'{temp_bucket}/{pop1}_{pop2}.ht', overwrite=overwrite, _read_if_exists=not overwrite)
    ht2 = ht2.filter(hl.is_defined(ht1[ht2.key])).add_index(name='new_idx').checkpoint(
        f'{temp_bucket}/{pop2}_{pop1}.ht', overwrite=overwrite, _read_if_exists=not overwrite)

    indices1 = ht1.idx.collect()
    indices2 = ht2.idx.collect()
    assert len(indices1) == len(indices2)

    r1 = BlockMatrix.read(ld_resources._ld_matrix_path(
        data_type, pop1, min_frequency >= COMMON_FREQ, adj=adj)).filter(indices1, indices1)
    r2 = BlockMatrix.read(ld_resources._ld_matrix_path(
        data_type, pop2, min_frequency >= COMMON_FREQ, adj=adj)).filter(indices2, indices2)
    r_bm = r1 * r2
    # TODO: is a bias adjustment needed?
    # r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    out_name = ld_resources._cross_pop_ld_scores_path(data_type, pop1, pop2, adj)
    compute_and_annotate_ld_score(ht1, r_bm, radius, out_name, overwrite)
def plot_correlation_matrices(chr_list):
    """Plot combined correlation matrices for genotype-correlation and
    sumstats-correlation matrices."""
    for ch in chr_list:
        ss_ch = BlockMatrix.read(f'gs://nbaya/sumstats_corr/{variant_set}_ss_correlation_chr{ch}.bm/')
        gt_ch = BlockMatrix.read(f'gs://nbaya/sumstats_corr/{variant_set}_gt_correlation_chr{ch}.bm/')
        M_max = int(1e4)    # max number of variants to be taken from the block matrices (suggested: 2e4)
        M = ss_ch.shape[0]  # dimension of block matrix
        for idx in range(0, int(M / M_max) + 1):  # index of the disjoint window in the block matrix
            M0 = M_max * idx               # start variant index for block matrix filtering
            M1 = min(M_max * (idx + 1), M)  # stop variant index for block matrix filtering
            ss_np = ss_ch[M0:M1, M0:M1].to_numpy()
            gt_np = gt_ch[M0:M1, M0:M1].to_numpy()
            print('\nStarting variant window: [' + str(M0) + ',' + str(M1) + ']')
            w = int(5e3)  # window width of variants for correlation matrix (suggested: 2e3)
            for i in range(int((M1 - M0 - 1) / w) + 1):
                w0 = w * i                      # start variant index for window of correlation matrix
                w1 = min(w * (i + 1), M1 - M0)  # stop variant index for window of correlation matrix
                # upper triangle from sumstats correlations, lower triangle from genotype correlations
                full = ss_np[w0:w1, w0:w1] + gt_np[w0:w1, w0:w1].T
                np.fill_diagonal(full, 1)

                fig, ax = plt.subplots()
                ax.imshow(full, cmap='bwr')
                ax.plot([0, w], [0, w], 'k--', alpha=0.5, lw=2)
                plt.xlim([0, w])
                plt.ylim([w, 0])
                ax.text(w * 0.83, w * 0.1, "SS", fontsize=60, alpha=0.5)
                ax.text(w * 0.02, w * 0.97, "GT", fontsize=60, alpha=0.5)
                plt.title(f'chr{ch} {variant_set} variants ({M0 + w0}-{M0 + w1})')
                fig = plt.gcf()
                fig.set_size_inches(10, 10)
                path = ('gs://nbaya/sumstats_corr/plots/chr' + str(ch) + '_' + variant_set + '_'
                        + str(M0 + w0).zfill(len(str(M))) + '-'
                        + str(M0 + w1).zfill(len(str(M))) + '.png')
                with hl.hadoop_open(path, 'wb') as f:
                    fig.savefig(f, dpi=600)
                plt.close()
            print('\nFinished variant window: [' + str(M0) + ',' + str(M1) + ']')
def tree_matmul_tree_matsum(bm1, bm2, mul_splits: int, sum_splits: int = None,
                            path_prefix: str = None, read_if_exists=False):
    r'''Version of tree_matmul() that allows for intermediate sums of matrix
    multiplication. `sum_splits` must be a divisor of `mul_splits`.
    '''
    # TODO: Make a private function that acts recursively to ensure that the
    # matrix sums never include more than a maximum number of matrices
    assert mul_splits % sum_splits == 0, '`sum_splits` must be a divisor of `mul_splits`'

    if not read_if_exists:
        print(bm1._n_block_cols)
        print(mul_splits)
        inner_brange_size = int(math.ceil(bm1._n_block_cols / mul_splits))
        print(f'inner_brange_size: {inner_brange_size}')
        split_points = list(range(0, bm1._n_block_cols, inner_brange_size)) + [bm1._n_block_cols]
        print(split_points)
        inner_ranges = list(zip(split_points[:-1], split_points[1:]))
        print(f'len(inner_ranges): {len(inner_ranges)}')
        blocks_to_multiply = [
            (bm1._select_blocks((0, bm1._n_block_rows), (start, stop)),
             bm2._select_blocks((start, stop), (0, bm2._n_block_cols)))
            for start, stop in inner_ranges]
        intermediate_multiply_exprs = [b1 @ b2 for b1, b2 in blocks_to_multiply]
        print(len(intermediate_multiply_exprs))
        print(f'Writing {mul_splits} intermediate matrices to {path_prefix}')
        hl.experimental.write_block_matrices(intermediate_multiply_exprs, path_prefix)

    read_intermediates = [BlockMatrix.read(f"{path_prefix}_{i}") for i in range(0, mul_splits)]

    tracked_partial_sums = []
    sum_block_size = math.ceil(mul_splits / sum_splits)
    for i in range(sum_splits):
        partial_sum_path = f"{path_prefix}-partial-{i}"
        sum(read_intermediates[i * sum_block_size:(i + 1) * sum_block_size]).write(
            partial_sum_path, overwrite=True)
        tracked_partial_sums.append(BlockMatrix.read(partial_sum_path))

    return sum(tracked_partial_sums)
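A minimal local sketch of how tree_matmul_tree_matsum might be invoked (the matrix sizes, block size, and temp prefix below are assumptions, not from the source). `sum_splits` must evenly divide `mul_splits`; each of the `mul_splits` intermediate products is written under `path_prefix`, then summed in `sum_splits` groups.

import numpy as np
import hail as hl
from hail.linalg import BlockMatrix

def example_tree_matmul(tmp_prefix='/tmp/tree_matmul_demo'):  # hypothetical prefix
    # 64x64 matrices with block_size=8 give 8 block columns to split over
    bm1 = BlockMatrix.from_numpy(np.random.rand(64, 64), block_size=8)
    bm2 = BlockMatrix.from_numpy(np.random.rand(64, 64), block_size=8)
    # 8 intermediate products, accumulated as 2 partial sums of 4 each
    return tree_matmul_tree_matsum(bm1, bm2, mul_splits=8, sum_splits=2,
                                   path_prefix=tmp_prefix)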
def test_stage_locally(self):
    nd = np.arange(0, 80, dtype=float).reshape(8, 10)
    bm_uri = new_temp_file()
    BlockMatrix.from_numpy(nd, block_size=3).write(bm_uri, stage_locally=True)

    bm = BlockMatrix.read(bm_uri)
    self._assert_eq(nd, bm)
def test_stage_locally(self):
    nd = np.arange(0, 80, dtype=float).reshape(8, 10)
    with hl.TemporaryDirectory(ensure_exists=False) as bm_uri:
        BlockMatrix.from_numpy(nd, block_size=3).write(bm_uri, stage_locally=True)

        bm = BlockMatrix.read(bm_uri)
        self._assert_eq(nd, bm)
def generate_ld_scores_from_ld_matrix(pop_data, data_type, min_frequency=0.01,
                                      call_rate_cutoff=0.8, adj: bool = False,
                                      radius: int = 1000000, overwrite=False):
    # This function required a decent number of high-mem machines (with an SSD for good measure)
    # to complete the AFR population. For the rest, on 20 n1-standard-8's: 1h15m to export the
    # block matrix, 15 mins to compute LD scores per population (~$150 total).
    for label, pops in dict(pop_data).items():
        for pop, n in pops.items():
            ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj))
            ht = ht.filter((ht.pop_freq.AF >= min_frequency) &
                           (ht.pop_freq.AF <= 1 - min_frequency) &
                           (ht.pop_freq.AN / n >= 2 * call_rate_cutoff)).add_index(name='new_idx')

            indices = ht.idx.collect()

            r2 = BlockMatrix.read(ld_resources._ld_matrix_path(
                data_type, pop, min_frequency >= COMMON_FREQ, adj=adj))
            r2 = r2.filter(indices, indices) ** 2
            # finite-sample bias adjustment for squared correlations
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            out_name = ld_resources._ld_scores_path(data_type, pop, adj)
            compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite)
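The `r2_adj` line above applies the standard finite-sample correction to squared correlations; a small numpy sketch (sample size and values assumed) shows its effect.

import numpy as np

n = 1000  # number of samples (assumed)
r2 = np.array([[1.0, 0.25],
               [0.25, 1.0]])  # raw squared correlations
r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))
# the diagonal stays exactly 1.0, while off-diagonal entries shrink
# slightly, removing the upward bias of the naive r^2 estimate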
def bm(self) -> BlockMatrix:
    """
    Read and return the Hail BlockMatrix resource.

    :return: Hail BlockMatrix resource
    """
    return BlockMatrix.read(self.path)
def test_write_overwrite(self):
    with hl.TemporaryDirectory(ensure_exists=False) as path:
        bm = BlockMatrix.from_numpy(np.array([[0]]))
        bm.write(path)
        self.assertRaises(FatalError, lambda: bm.write(path))

        bm2 = BlockMatrix.from_numpy(np.array([[1]]))
        bm2.write(path, overwrite=True)
        self._assert_eq(BlockMatrix.read(path), bm2)
def generate_ld_scores_from_ld_matrix(pop_data, data_type, min_frequency=0.01,
                                      call_rate_cutoff=0.8, adj: bool = False,
                                      radius: int = 1000000, overwrite=False):
    # This function required a decent number of high-mem machines (with an SSD for good measure)
    # to complete the AFR population. For the rest, on 20 n1-standard-8's: 1h15m to export the
    # block matrix, 15 mins to compute LD scores per population (~$150 total).
    for label, pops in dict(pop_data).items():
        for pop, n in pops.items():
            if pop in ('nfe', 'fin', 'asj'):
                continue
            ht = hl.read_table(ld_index_path(data_type, pop, adj=adj))
            ht = ht.filter((ht.pop_freq.AF >= min_frequency) &
                           (ht.pop_freq.AF <= 1 - min_frequency) &
                           (ht.pop_freq.AN / n >= 2 * call_rate_cutoff)).add_index(name='new_idx')

            indices = ht.idx.collect()

            r2 = BlockMatrix.read(ld_matrix_path(data_type, pop, min_frequency >= COMMON_FREQ, adj=adj))
            r2 = r2.filter(indices, indices) ** 2
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False)

            # Lifted directly from https://github.com/hail-is/hail/blob/555e02d6c792263db2c3ed97db8002b489e2dacb/hail/python/hail/methods/statgen.py#L2595
            # for the time being, until efficient BlockMatrix filtering gets an easier interface
            r2_adj = BlockMatrix._from_java(r2_adj._jbm.filterRowIntervalsIR(
                Env.backend()._to_java_ir(starts_and_stops._ir), False))

            l2row = r2_adj.sum(axis=0).T
            l2col = r2_adj.sum(axis=1)
            l2 = l2row + l2col + 1

            l2_bm_tmp = new_temp_file()
            l2_tsv_tmp = new_temp_file()
            l2.write(l2_bm_tmp, force_row_major=True)
            BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

            ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
            ht_scores = ht_scores.add_index().rename({'f0': 'ld_score'})
            ht_scores = ht_scores.key_by('idx')

            ht = ht.annotate(**ht_scores[ht.new_idx]).select_globals()
            ht.filter(hl.is_defined(ht.ld_score)).write(ld_scores_path(data_type, pop, adj), overwrite)
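The `l2row + l2col + 1` line assembles per-variant LD scores from one-sided storage: assuming the banded matrix keeps each correlated pair once and omits the diagonal, the symmetric window sum is the column sum plus the row sum plus the variant's own r2 of 1. A numpy sketch with assumed values:

import numpy as np

r2 = np.array([[0.0, 0.5, 0.2],
               [0.0, 0.0, 0.4],
               [0.0, 0.0, 0.0]])  # strictly upper-triangular storage (assumed)
l2 = r2.sum(axis=0) + r2.sum(axis=1) + 1  # column sums + row sums + diagonal
# l2 == [1.7, 1.9, 1.6], the row sums of the full symmetric matrix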
def test_write_from_entry_expr_overwrite(self):
    mt = hl.balding_nichols_model(1, 1, 1)
    mt = mt.select_entries(x=mt.GT.n_alt_alleles())
    bm = BlockMatrix.from_entry_expr(mt.x)

    path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, path)
    self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x, path))

    BlockMatrix.write_from_entry_expr(mt.x, path, overwrite=True)
    self._assert_eq(BlockMatrix.read(path), bm)

    # non-field expressions currently take a separate code path
    path2 = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x + 1, path2)
    self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x + 1, path2))

    BlockMatrix.write_from_entry_expr(mt.x + 2, path2, overwrite=True)
    self._assert_eq(BlockMatrix.read(path2), bm + 2)
def test_write_overwrite(self):
    path = new_temp_file()
    bm = BlockMatrix.from_numpy(np.array([[0]]))
    bm.write(path)
    self.assertRaises(FatalError, lambda: bm.write(path))

    bm2 = BlockMatrix.from_numpy(np.array([[1]]))
    bm2.write(path, overwrite=True)
    self._assert_eq(BlockMatrix.read(path), bm2)
def test_write_from_entry_expr_overwrite(self):
    mt = hl.balding_nichols_model(1, 1, 1)
    mt = mt.select_entries(x=mt.GT.n_alt_alleles())
    bm = BlockMatrix.from_entry_expr(mt.x)

    with hl.TemporaryDirectory(ensure_exists=False) as path:
        BlockMatrix.write_from_entry_expr(mt.x, path)
        self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x, path))

        BlockMatrix.write_from_entry_expr(mt.x, path, overwrite=True)
        self._assert_eq(BlockMatrix.read(path), bm)

    with hl.TemporaryDirectory(ensure_exists=False) as path:
        # non-field expressions currently take a separate code path
        BlockMatrix.write_from_entry_expr(mt.x + 1, path)
        self.assertRaises(FatalError, lambda: BlockMatrix.write_from_entry_expr(mt.x + 1, path))

        BlockMatrix.write_from_entry_expr(mt.x + 2, path, overwrite=True)
        self._assert_eq(BlockMatrix.read(path), bm + 2)
def test_from_entry_expr(self):
    mt = get_dataset()
    mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

    a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
    a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
    a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

    self._assert_eq(a1, a2)
    self._assert_eq(a1, a3)

    path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
    a4 = BlockMatrix.read(path).to_numpy()
    self._assert_eq(a1, a4)
def test_from_entry_expr(self):
    mt = get_dataset()
    mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

    a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
    a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
    a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

    self._assert_eq(a1, a2)
    self._assert_eq(a1, a3)

    with hl.TemporaryDirectory(ensure_exists=False) as path:
        BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
        a4 = BlockMatrix.read(path).to_numpy()
        self._assert_eq(a1, a4)
def export_snv_sv_ld_matrix(pop_data, data_type, common_only: bool = True,
                            adj: bool = False, overwrite: bool = False):
    for label, pops in dict(pop_data).items():
        for pop in pops:
            if pop not in SNV_SV_POPS:
                continue
            bm = BlockMatrix.read(ld_resources._ld_matrix_path(data_type, pop, common_only, adj))
            ld_index = hl.read_table(ld_resources._ld_index_path(data_type, pop, common_only, adj))
            # SVs are encoded with an "N" reference allele in the combined index
            snvs = ld_index.filter(ld_index.alleles[0] != "N")
            svs = ld_index.filter(ld_index.alleles[0] == "N")

            snv_indices = snvs.idx.collect()
            sv_indices = svs.idx.collect()

            ht = bm.filter(snv_indices, sv_indices).entries(keyed=False)
            ht.filter(ht.entry != 0).write(ld_resources._ld_snv_sv_path(pop), overwrite)

            hl.read_table(ld_resources._ld_snv_sv_path(pop)).export(
                ld_resources._ld_snv_sv_path(pop).replace('.ht', '.txt.bgz'))

            snvs = snvs.add_index().key_by()
            svs = svs.add_index().key_by()
            snvs.select(chrom=snvs.locus.contig, pos=snvs.locus.position,
                        ref=snvs.alleles[0], alt=snvs.alleles[1],
                        i=snvs.idx).export(ld_resources._ld_snv_sv_index_path(pop, 'snv'))
            svs.select(chrom=svs.locus.contig, pos=svs.locus.position,
                       ref=svs.alleles[0], alt=svs.alleles[1],
                       j=svs.idx).export(ld_resources._ld_snv_sv_index_path(pop, 'sv'))
def get_ref_X(ref_panel, overwrite=False):
    r'''Returns the N_ref x M matrix of column-standardized genotypes of the LD ref panel.'''
    X_bm_path = f'{bucket}/{ref_panel}.X.bm'

    if overwrite or not hl.hadoop_is_file(f'{X_bm_path}/_SUCCESS'):
        mt = hl.import_plink(bed=f'{bucket}/{ref_panel}.bed',
                             bim=f'{bucket}/{ref_panel}.bim',
                             fam=f'{bucket}/{ref_panel}.fam')
        mt = mt.annotate_rows(stats=hl.agg.stats(mt.GT.n_alt_alleles()))
        # center and scale genotypes per variant
        mt = mt.annotate_entries(X=(mt.GT.n_alt_alleles() - mt.stats.mean) / mt.stats.stdev)
        X = BlockMatrix.from_entry_expr(mt.X)
        X = X.T  # variants x samples -> samples x variants
        X.write(X_bm_path, overwrite=True)

    X = BlockMatrix.read(X_bm_path)
    return X
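A numpy analogue (data assumed, for illustration only) of the standardization performed above: each variant's genotype vector is centered and scaled to unit variance before the matrix is transposed to samples-by-variants.

import numpy as np

G = np.random.randint(0, 3, size=(5, 100)).astype(float)  # variants x samples (assumed)
X = (G - G.mean(axis=1, keepdims=True)) / G.std(axis=1, keepdims=True)
X = X.T  # N_ref x M, matching the layout returned by get_ref_X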
def from_random_effects(cls, y, x, z, p_path=None, overwrite=False,
                        max_condition_number=1e-10, complexity_bound=8192):
    r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

    Examples
    --------
    >>> from hail.stats import LinearMixedModel
    >>> y = np.array([0.0, 1.0, 8.0, 9.0])
    >>> x = np.array([[1.0, 0.0],
    ...               [1.0, 2.0],
    ...               [1.0, 1.0],
    ...               [1.0, 4.0]])
    >>> z = np.array([[0.0, 0.0, 1.0],
    ...               [0.0, 1.0, 2.0],
    ...               [1.0, 2.0, 4.0],
    ...               [2.0, 4.0, 8.0]])
    >>> model, p = LinearMixedModel.from_random_effects(y, x, z)
    >>> model.fit()
    >>> model.h_sq
    0.38205307244271675

    Notes
    -----
    If :math:`n \leq m`, the returned model is full rank.

    If :math:`n > m`, the returned model is low rank. In this case only,
    eigenvalues less than or equal to `max_condition_number` times the top
    eigenvalue are dropped from :math:`S`, with the corresponding
    eigenvectors dropped from :math:`P`. This guards against precision
    loss on left eigenvectors computed via the right gramian :math:`Z^T Z`
    in :meth:`BlockMatrix.svd`.

    In either case, one can truncate to a rank :math:`r` model as follows.
    If `p` is an ndarray:

    >>> p_r = p[:r, :]     # doctest: +SKIP
    >>> s_r = model.s[:r]  # doctest: +SKIP
    >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

    If `p` is a block matrix:

    >>> p[:r, :].write(p_r_path)          # doctest: +SKIP
    >>> p_r = BlockMatrix.read(p_r_path)  # doctest: +SKIP
    >>> s_r = model.s[:r]                 # doctest: +SKIP
    >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path)  # doctest: +SKIP

    This method applies no standardization to `z`.

    Warning
    -------
    If `z` is a block matrix, then ideally `z` should be the result of
    directly reading from disk (and possibly a transpose). This is most
    critical if :math:`n > m`, because in this case multiplication by `z`
    will result in all preceding transformations being repeated
    ``n / block_size`` times, as explained in :class:`.BlockMatrix`.

    At least one dimension must be less than or equal to 46300.
    See the warning in :meth:`.BlockMatrix.svd` for performance
    considerations.

    Parameters
    ----------
    y: :class:`ndarray`
        :math:`n` vector of observations :math:`y`.
    x: :class:`ndarray`
        :math:`n \times p` matrix of fixed effects :math:`X`.
    z: :class:`ndarray` or :class:`BlockMatrix`
        :math:`n \times m` matrix of random effects :math:`Z`.
    p_path: :obj:`str`, optional
        Path at which to write :math:`P` as a block matrix.
        Required if `z` is a block matrix.
    overwrite: :obj:`bool`
        If ``True``, overwrite an existing file at `p_path`.
    max_condition_number: :obj:`float`
        Maximum condition number. Must be greater than 1e-16.
    complexity_bound: :obj:`int`
        Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block
        matrix.

    Returns
    -------
    model: :class:`LinearMixedModel`
        Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
    p: :class:`ndarray` or :class:`.BlockMatrix`
        Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
        The type is block matrix if `z` is a block matrix and
        :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix.
    """
    z_is_bm = isinstance(z, BlockMatrix)
    if z_is_bm and p_path is None:
        raise ValueError("from_random_effects: 'p_path' required when 'z' "
                         "is a block matrix.")
    if max_condition_number < 1e-16:
        raise ValueError("from_random_effects: 'max_condition_number' must "
                         f"be at least 1e-16, found {max_condition_number}")

    _check_dims(y, "y", 1)
    _check_dims(x, "x", 2)
    _check_dims(z, "z", 2)

    n, m = z.shape

    if y.shape[0] != n:
        raise ValueError("from_random_effects: 'y' and 'z' must have the "
                         "same number of rows")
    if x.shape[0] != n:
        raise ValueError("from_random_effects: 'x' and 'z' must have the "
                         "same number of rows")

    if z_is_bm:
        u, s0, _ = z.svd(complexity_bound=complexity_bound)
        p = u.T
        p_is_bm = isinstance(p, BlockMatrix)
    else:
        u, s0, _ = hl.linalg._svd(z, full_matrices=False)
        p = u.T
        p_is_bm = False

    s = s0 ** 2

    low_rank = n > m

    if low_rank:
        assert np.all(np.isfinite(s))
        r = np.searchsorted(-s, -max_condition_number * s[0])
        if r < m:
            info(f'from_random_effects: model rank reduced from {m} to {r} '
                 f'due to ill-conditioning.'
                 f'\n    Largest dropped eigenvalue was {s[r]}.')
        s = s[:r]
        p = p[:r, :]

    if p_path is not None:
        if p_is_bm:
            p.write(p_path, overwrite=overwrite)
            p = BlockMatrix.read(p_path)
        else:
            BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)
    if p_is_bm:
        # block-matrix matmul requires 2-D operands, so reshape the
        # vector y and flatten the result back to 1-D
        py, px = (p @ y.reshape(n, 1)).to_numpy().flatten(), (p @ x).to_numpy()
    else:
        py, px = p @ y, p @ x

    if low_rank:
        model = LinearMixedModel(py, px, s, y, x, p_path)
    else:
        model = LinearMixedModel(py, px, s, p_path=p_path)

    return model, p
def main(args):
    ht_snp = hl.import_table(args.snp, impute=True)
    ht_snp = ht_snp.annotate(variant=hl.delimit(
        [ht_snp.chromosome, hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2],
        delimiter=':'))
    ht_snp = ht_snp.annotate(**hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht')
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract most severe consequence
    ht_snp = ht_snp.annotate(
        vep=(hl.case()
             .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
                   ht_snp.vep.worst_csq_for_variant_canonical)
             .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant),
                   ht_snp.vep.worst_csq_for_variant)
             .or_missing()),
        is_canonical_vep=hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical))
    ht_snp = ht_snp.annotate(
        most_severe=hl.if_else(hl.is_defined(ht_snp.vep),
                               ht_snp.vep.most_severe_consequence,
                               'intergenic_variant'),
        gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(**annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())

    df = ht_snp.key_by().drop('locus', 'alleles', 'variant', 'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht')
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')

        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())

        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()

        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm')
        bm = bm.filter(idx, idx)
        # re-densify triangular matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        bm = bm.filter_rows(np.where(np.array(idx) == lead_idx[0])[0].tolist()) ** 2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        df[f'gnomad_lead_r2_{pop}'] = np.nan
        df[f'gnomad_lead_r2_{pop}'].iloc[idx_snp] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open
    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
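The `bm + bm.T - diag` step above restores a dense symmetric matrix from one-triangle storage; a small numpy illustration (values assumed):

import numpy as np

tri = np.array([[1.0, 0.3, 0.1],
                [0.0, 1.0, 0.2],
                [0.0, 0.0, 1.0]])  # upper-triangular LD, as stored on disk
full = tri + tri.T - np.diag(np.diag(tri))  # subtract the double-counted diagonal
assert np.allclose(full, full.T)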
def ld_score(entry_expr, locus_expr, radius, coord_expr=None,
             annotation_exprs=None, block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+

    Warning
    -------
    :func:`.ld_score` will fail if ``entry_expr`` results in any missing
    values. The special float value ``nan`` is not considered a
    missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----
    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation.
    """
    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr, mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr, mt == mt_coord_expr] +
                        [mt == x._indices.source for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr, radius, coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x), value=hl.float(ht[x]))
               .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key), col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)
        a = BlockMatrix.read(a_tmp)

        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
def __init__(self, py, px, s, y=None, x=None, p_path=None):
    if y is None and x is None:
        low_rank = False
    elif y is not None and x is not None:
        low_rank = True
    else:
        raise ValueError('for low-rank, set both y and x; for full-rank, do not set y or x.')

    _check_dims(py, 'py', 1)
    _check_dims(px, 'px', 2)
    _check_dims(s, 's', 1)

    r = s.size
    f = px.shape[1]

    if py.size != r:
        raise ValueError("py and s must have the same size")
    if px.shape[0] != r:
        raise ValueError("px must have the same number of rows as the size of s")

    if low_rank:
        _check_dims(y, 'y', 1)
        _check_dims(x, 'x', 2)
        n = y.size
        if n <= r:
            raise ValueError("size of y must be larger than the size of s")
        if x.shape[0] != n:
            raise ValueError("x must have the same number of rows as the size of y")
        if x.shape[1] != f:
            raise ValueError("px and x must have the same number of columns")
    else:
        n = r

    if p_path is not None:
        n_rows, n_cols = BlockMatrix.read(p_path).shape
        if n_cols != n:
            raise ValueError("LinearMixedModel: Number of columns in the block "
                             f"matrix at 'p_path' ({n_cols}) must equal "
                             f"the size of 'y' ({n})")
        if n_rows != r:
            raise ValueError("LinearMixedModel: Number of rows in the block "
                             f"matrix at 'p_path' ({n_rows}) must equal "
                             f"the size of 'py' ({r})")

    self.low_rank = low_rank
    self.n = n
    self.f = f
    self.r = r
    self.py = py
    self.px = px
    self.s = s
    self.y = y
    self.x = x
    self.p_path = p_path

    self._check_dof()

    self.beta = None
    self.sigma_sq = None
    self.tau_sq = None
    self.gamma = None
    self.log_gamma = None
    self.h_sq = None
    self.h_sq_standard_error = None
    self.optimize_result = None

    self._fitted = False

    if low_rank:
        self._yty = y @ y
        self._xty = x.T @ y
        self._xtx = x.T @ x

    self._dof = n - f
    self._d = None
    self._ydy = None
    self._xdy = None
    self._xdx = None

    self._dof_alt = n - (f + 1)
    self._d_alt = None
    self._ydy_alt = None
    self._xdy_alt = np.zeros(f + 1)
    self._xdx_alt = np.zeros((f + 1, f + 1))

    self._residual_sq = None

    self._scala_model = None
def _test_linear_mixed_model_low_rank(self):
    seed = 0
    n_populations = 8
    fst = n_populations * [.9]
    n_samples = 500
    n_variants = 200
    n_orig_markers = 100
    n_culprits = 10
    n_covariates = 3
    sigma_sq = 1
    tau_sq = 1

    from numpy.random import RandomState
    prng = RandomState(seed)

    x = np.hstack((np.ones(shape=(n_samples, 1)),
                   prng.normal(size=(n_samples, n_covariates - 1))))

    mt = hl.balding_nichols_model(n_populations=n_populations,
                                  n_samples=n_samples,
                                  n_variants=n_variants,
                                  fst=fst,
                                  af_dist=hl.rand_unif(0.1, 0.9, seed=seed),
                                  seed=seed)

    pa_t_path = utils.new_temp_file(suffix='bm')
    a_t_path = utils.new_temp_file(suffix='bm')

    BlockMatrix.write_from_entry_expr(mt.GT.n_alt_alleles(), a_t_path)

    a = BlockMatrix.read(a_t_path).T.to_numpy()
    g = a[:, -n_orig_markers:]
    g_std = self._filter_and_standardize_cols(g)

    n_markers = g_std.shape[1]

    k = (g_std @ g_std.T) * n_samples / n_markers

    beta = np.arange(n_covariates)
    beta_stars = np.array([1] * n_culprits)

    y = prng.multivariate_normal(
        np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
        sigma_sq * k + tau_sq * np.eye(n_samples))

    # low rank computation of S, P
    l = g_std.T @ g_std
    sl, v = np.linalg.eigh(l)
    n_eigenvectors = int(np.sum(sl > 1e-10))
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n_samples / n_markers)
    p = (g_std @ (v / np.sqrt(sl))).T

    # compare with full rank S, P
    sk0, uk = np.linalg.eigh(k)
    sk = sk0[-n_eigenvectors:]
    pk = uk[:, -n_eigenvectors:].T
    assert np.allclose(sk, s)
    assert np.allclose(np.abs(pk), np.abs(p))

    # build and fit model
    py = p @ y
    px = p @ x
    pa = p @ a

    model = LinearMixedModel(py, px, s, y, x)
    assert model.n == n_samples
    assert model.f == n_covariates
    assert model.r == n_eigenvectors
    assert model.low_rank

    model.fit()

    # check effect sizes tend to be near 1 for the first n_culprits alternative models
    BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
    df_lmm = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

    assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

    # compare NumPy and Hail LMM per alternative
    df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()
    assert np.min(df_numpy['chi_sq']) > 0

    na_numpy = df_numpy.isna().any(axis=1)
    na_lmm = df_lmm.isna().any(axis=1)

    assert na_numpy.sum() <= 10
    assert na_lmm.sum() <= 10
    assert np.logical_xor(na_numpy, na_lmm).sum() <= 5

    mask = ~(na_numpy | na_lmm)
    lmm_vs_numpy_p_value = np.sort(np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

    assert lmm_vs_numpy_p_value[10] < 1e-12  # 10 smallest p-value differences
    assert lmm_vs_numpy_p_value[-1] < 1e-8   # all p-values
mt = mt.filter_cols(mt.super_population == 'EUR')
mt = hl.variant_qc(mt)
mt = mt.filter_rows((mt.variant_qc.AF[0] > 0.001) & (mt.variant_qc.AF[1] > 0.001))

BlockMatrix.write_from_entry_expr(
    entry_expr=mt.GT.n_alt_alleles(),
    path='gs://hail-datasets-hail-data/1000_Genomes_phase3_European_autosomes_maf_gt_001.bm',
    mean_impute=True,
    center=False,
    normalize=False,
    block_size=4096,
    overwrite=True)

bm = BlockMatrix.read(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_European_autosomes_maf_gt_001.bm')

metadata = hl.struct(name='1000_Genomes_phase3_European_autosomes_maf_gt_001',
                     reference_genome='GRCh37',
                     n_rows=bm.n_rows,
                     n_cols=bm.n_cols,
                     block_size=bm.block_size)

hl.experimental.write_expression(
    metadata,
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_European_autosomes_maf_gt_001.metadata.he',
    overwrite=True)
import hail as hl
from hail.linalg import BlockMatrix
from os import path
import sys
import pandas as pd

chr_id, group_id = sys.argv[1], sys.argv[2]
print(chr_id + ' ' + group_id)

idx_comb = pd.read_csv("mapLDref.tsv.gz", sep="\t")
idx_comb['chr'] = idx_comb['chr'].astype(str)
idx_comb['group'] = idx_comb['group'].astype(str)
idx_comb_chr = idx_comb[idx_comb.chr == chr_id]
chridx = idx_comb_chr[idx_comb_chr.group == group_id].idx.tolist()
ext = 'chr' + chr_id + '.' + group_id

## Load data
bm = BlockMatrix.read('s3a://pan-ukb-us-east-1/ld_release/UKBB.EUR.ldadj.bm')
bmchr = bm.filter(chridx, chridx)
bmchr.write(ext + '.bm', force_row_major=True)
BlockMatrix.export(ext + '.bm', ext + '.csv.bgz', delimiter='\t')
def get_ld_matrix(pop: str):
    return BlockMatrix.read(ld_matrix_path('genomes', pop))
def compute_test_prs_bm(genotype_bm_path, prs_bm_path, args):
    sumstats_bm = BlockMatrix.read(get_clump_sumstats_bm_path(args.high_quality))
    genotype_bm = BlockMatrix.read(genotype_bm_path)
    prs_bm: BlockMatrix = genotype_bm.T @ sumstats_bm
    prs_bm.write(prs_bm_path, args.overwrite)
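The PRS here is a single matrix product: transposed genotypes (samples x variants) times a matrix of clumped, thresholded betas. A toy numpy version (shapes assumed, for illustration):

import numpy as np

G = np.random.randint(0, 3, size=(30, 10)).astype(float)  # variants x samples (assumed)
B = np.random.randn(30, 4)  # betas per (phenotype, p-threshold) column, zeroed where clumped out
PRS = G.T @ B               # samples x 4 matrix of polygenic scores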
def main(args):
    pop = args.pop
    num_pcs = 10
    basic_covars = ['sex', 'age', 'age2', 'age_sex', 'age2_sex']
    covariates = basic_covars + [f'PC{x}' for x in range(1, num_pcs + 1)]
    tmp_mt_path = f'{temp_bucket_7day}/{pop}.mt'
    tmp_bm_path = f'{temp_bucket_7day}/{pop}.bm'

    if args.write_mt:
        mt = get_filtered_mt(chrom='all', pop=pop, entry_fields=['dosage'],
                             min_mac=19, filter_mac_instead_of_ac=True)
        mt_x = get_filtered_mt(chrom='X', pop=pop, entry_fields=['dosage'],
                               min_mac=19, filter_mac_instead_of_ac=True)
        mt = mt.union_rows(mt_x)
        mt = mt.annotate_rows(AF=hl.agg.mean(mt.dosage) / 2)
        mt = mt.checkpoint(tmp_mt_path, overwrite=args.overwrite)
        n = mt.count()[1]

        # write variant indexes
        ht = mt.rows().select().add_index()
        ht = ht.annotate_globals(n_samples=n, pop=pop)
        ht.write(get_ld_variant_index_path(pop), overwrite=args.overwrite)
    else:
        mt = hl.read_matrix_table(tmp_mt_path)
        n = mt.count()[1]

    if args.write_bm:
        # convert mt to bm
        BlockMatrix.write_from_entry_expr(mt.dosage, tmp_bm_path,
                                          mean_impute=True, center=False,
                                          normalize=False, overwrite=args.overwrite)
    bm = BlockMatrix.read(tmp_bm_path)

    if args.compute_ld_matrix:
        print(f'BlockMatrix shape: {bm.shape}')

        # mean-center and normalize bm
        bm_norm = normalize_bm(bm)
        bm_norm = checkpoint_tmp(bm_norm)

        # take covariates (with intercept), make hat bms for FWL projection
        cov = mt.cols().select(*covariates).to_pandas().drop(['s'], axis=1)
        cov['Intercept'] = 1.0
        hat1 = cov.values
        hat2 = np.dot(np.linalg.inv(np.dot(cov.transpose(), cov)), cov.transpose())
        bm_hat1 = checkpoint_tmp(BlockMatrix.from_numpy(hat1))
        bm_hat2 = checkpoint_tmp(BlockMatrix.from_numpy(hat2))

        # covariate adjustment; done in three steps because of the huge matrix operations
        bm_Z = checkpoint_tmp(bm_norm @ bm_hat1)
        bm_Z = checkpoint_tmp(bm_Z @ bm_hat2)
        bm_Z = checkpoint_tmp(bm_norm - bm_Z)

        # compute LD matrix with a specified radius
        bm_ldadj = (bm_Z @ bm_Z.T) / n
        starts_and_stops = hl.linalg.utils.locus_windows(mt.locus, radius=args.radius,
                                                         _localize=False)
        bm_ldadj = bm_ldadj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False)

        # sparsify to a triangle matrix
        bm_ldadj = bm_ldadj.sparsify_triangle()
        bm_ldadj = bm_ldadj.checkpoint(get_ld_matrix_path(pop),
                                       overwrite=args.overwrite,
                                       force_row_major=True)
    else:
        bm_ldadj = BlockMatrix.read(get_ld_matrix_path(pop))

    if args.write_ldsc_hm3_snplist:
        # Note: currently, this writes snplists for all the populations at once
        write_ldsc_hm3_snplist(overwrite=args.overwrite)

    if args.compute_ldscore:
        ht_ldscore = copmute_ldscore(mt.rows(), bm_ldadj, n,
                                     radius=args.ld_score_radius,
                                     out_name=get_ld_score_ht_path(pop),
                                     overwrite=args.overwrite)
        export_ldscore(ht_ldscore, pop)
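The three-step product above residualizes genotypes on the covariates (the Frisch-Waugh-Lovell projection); a numpy sketch with assumed shapes:

import numpy as np

n_samples, n_variants, n_cov = 100, 50, 5  # assumed sizes
X = np.random.randn(n_variants, n_samples)      # normalized genotypes (variants x samples)
C = np.hstack([np.random.randn(n_samples, n_cov),
               np.ones((n_samples, 1))])        # covariates plus intercept
hat1 = C                                        # corresponds to bm_hat1
hat2 = np.linalg.inv(C.T @ C) @ C.T             # corresponds to bm_hat2
Z = X - (X @ hat1) @ hat2                       # genotypes with covariate effects removed
# each row of Z is orthogonal to every covariate column
assert np.allclose(Z @ C, 0)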
def main(args):
    hl.init(default_reference='GRCh37', log='/prs.log',
            spark_conf={'spark.hadoop.fs.gs.requester.pays.mode': 'AUTO',
                        'spark.hadoop.fs.gs.requester.pays.project.id': 'ukbb-diversepops-neale'})

    if args.prepare_sumstats_matrix:
        # get meta mt and separate by pop combo
        meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
        meta_mt = separate_results_mt_by_pop(meta_mt, 'meta_analysis_data', 'meta_analysis')
        meta_mt = meta_mt.annotate_cols(clump_pops=meta_mt.meta_analysis_data.pop)
        meta_mt = meta_mt.key_cols_by('clump_pops', *meta_mt.col_key)

        # get sumstats mt and separate by pop combo
        ss_mt = get_final_sumstats_mt_for_export()
        ss_mt = separate_results_mt_by_pop(ss_mt, 'pheno_data', 'summary_stats')
        ss_mt = ss_mt.annotate_cols(clump_pops=hl.array([ss_mt.pheno_data.pop]))
        ss_mt = ss_mt.key_cols_by(*meta_mt.col_key)

        # join meta results and sumstats mt
        # NOTE: union_cols() requires the same entry fields schema
        meta_mt = meta_mt.select_entries(BETA=meta_mt.meta_analysis.BETA,
                                         Pvalue=meta_mt.meta_analysis.Pvalue).select_cols().select_rows()
        ss_mt = ss_mt.select_entries(BETA=ss_mt.summary_stats.BETA,
                                     Pvalue=ss_mt.summary_stats.Pvalue).select_cols().select_rows()
        mt = meta_mt.union_cols(ss_mt)

        # filter to distinct cols
        # NOTE: distinct_by_col() does not allow a col key of type `list`
        mt = mt.annotate_cols(clump_pops_str=hl.delimit(mt.clump_pops)).key_cols_by(
            'clump_pops_str', *[k for k in mt.col_key if k != 'clump_pops']).distinct_by_col()
        mt = mt.distinct_by_col()

        # ensure that betas are not missing
        ss_mt = ss_mt.annotate_cols(clump_pops_str=hl.delimit(ss_mt.clump_pops)).key_cols_by(
            'clump_pops_str', *[k for k in ss_mt.col_key if k != 'clump_pops'])
        mt = mt.annotate_entries(BETA=hl.or_else(mt.BETA, ss_mt[mt.row_key, mt.col_key].BETA),
                                 Pvalue=hl.or_else(mt.Pvalue, ss_mt[mt.row_key, mt.col_key].Pvalue))

        # read clump mt and separate by pop combo
        clump_mt = hl.read_matrix_table(get_clumping_results_path(high_quality=args.high_quality,
                                                                  max_pops=args.max_pops))
        if args.max_pops:
            # if max_pops=True, the clump_mt is already separated by pop;
            # these steps are necessary to make downstream code usable for both max_pops=True/False
            clump_mt = clump_mt.annotate_entries(plink_clump=hl.struct(TOTAL=clump_mt.TOTAL))
            clump_mt = clump_mt.annotate_cols(pop_index=0)
        else:
            clump_mt = separate_results_mt_by_pop(clump_mt, 'clump_pops', 'plink_clump',
                                                  skip_drop=True)
            clump_mt = clump_mt.annotate_cols(clump_pops_str=hl.delimit(clump_mt.clump_pops))
            clump_mt = clump_mt.drop('clump_pops').key_cols_by(*mt.col_key)

        # join sumstats/meta-analysis with clump mt
        mt = all_axis_join(mt, clump_mt)
        mt = mt.filter_cols(hl.is_defined(mt.pop_index))

        print(f'\n\nMatrix dimensions (before explode by p-threshold): {mt.count()}\n')

        mt = explode_by_p_threshold(mt).unfilter_entries()

        # write pheno data for later use
        mt.add_col_index('idx').key_cols_by('idx').cols().write(
            get_clump_sumstats_col_ht_path(high_quality=args.high_quality,
                                           max_pops=args.max_pops),
            args.overwrite)
        BlockMatrix.write_from_entry_expr(
            hl.or_else(mt.BETA * hl.is_defined(mt.plink_clump.TOTAL) * hl.int(mt.Pvalue < mt.p_threshold), 0.0),
            get_clump_sumstats_bm_path(high_quality=args.high_quality, max_pops=args.max_pops),
            args.overwrite)
        # 2020-06-25 01:49:32 Hail: INFO: Wrote all 7078 blocks of 28987534 x 3530 matrix with block size 4096.
    # If clump_mt is significantly smaller than meta_mt, consider putting that on the left of the join,
    # then filter the genotype matrix to only those SNPs (pilot would go from 28.9M -> 21.2M)
    if args.prepare_genotype_matrix:
        meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
        mt = get_filtered_mt_with_x()
        mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
        # write sample data for later use
        mt = mt.key_cols_by(userId=hl.int32(mt.s))
        mt.cols().add_index().write(genotype_samples_ht_path, args.overwrite)
        BlockMatrix.write_from_entry_expr(mt.dosage, genotype_bm_path, args.overwrite)
        # 2020-06-25 19:18:14 Hail: INFO: Wrote all 764424 blocks of 28987534 x 441345 matrix with block size 4096.

    if args.compute_prs:
        sumstats_bm = BlockMatrix.read(get_clump_sumstats_bm_path(high_quality=args.high_quality,
                                                                  max_pops=args.max_pops))
        genotype_bm = BlockMatrix.read(genotype_bm_path)
        mul_splits = 197  # sumstats_bm.shape[1] // 10000 * 10
        sum_splits = 20   # int(mul_splits / 10)
        assert mul_splits > 10  # if not more than 10, sum_splits is not necessary
        prs_bm = tree_matmul_tree_matsum(
            genotype_bm.T, sumstats_bm, mul_splits=mul_splits, sum_splits=sum_splits,
            path_prefix=f'{temp_bucket}/prs/tree_matmul{"_max_pops" if args.max_pops else ""}',
            read_if_exists=True)
        prs_bm.write(get_prs_bm_path(high_quality=args.high_quality, max_pops=args.max_pops),
                     args.overwrite)

    if args.create_prs_mt:
        prs_bm = BlockMatrix.read(get_prs_bm_path(high_quality=args.high_quality,
                                                  max_pops=args.max_pops))
        pheno_ht = hl.read_table(get_clump_sumstats_col_ht_path(
            high_quality=args.high_quality, max_pops=args.max_pops)).key_by('idx')
        samples_ht = hl.read_table(genotype_samples_ht_path).key_by('idx')
        # 10k partitions for 370 GB table (441k x 108k) = 37 MB/partition
        # 5014 partitions for 240 GB table (441k x 72k) = 48 MB/partition (max_pops)
        n_partitions = 15000  # int(1000 * (pheno_ht.count() / 72 * 5) // 1000), or hard-code
        mt = BlockMatrix.to_matrix_table_row_major(prs_bm,
                                                   n_partitions=n_partitions).rename({'element': 'score'})
        mt = mt.annotate_cols(**pheno_ht[mt.col_key]).key_cols_by(*PHENO_KEY_FIELDS)
        mt = mt.annotate_rows(**samples_ht[mt.row_key]).key_rows_by('userId')
        mt.write(get_prs_mt_path(high_quality=args.high_quality, max_pops=args.max_pops),
                 args.overwrite)

    if args.assess_prs:
        prs_mt = hl.read_matrix_table(get_prs_mt_path(high_quality=args.high_quality,
                                                      max_pops=args.max_pops))
        pheno_mt = get_ukb_pheno_mt()  # TODO: fix all phenos to new keying scheme
        pheno_mt = pheno_mt.key_cols_by(
            **pheno_mt.col_key.annotate(modifier=hl.if_else(pheno_mt.trait_type == "biomarkers",
                                                            "irnt", pheno_mt.modifier)))
        mt = prs_mt.annotate_entries(**pheno_mt[prs_mt.row_key, prs_mt.col_key])
        mt = mt.annotate_cols(description=pheno_mt.cols()[mt.col_key].description)
        for pop in POPS:
            mt_pop = mt.filter_rows(mt.pop == pop)
            mt_pop = mt_pop.annotate_cols(prs_corr=hl.agg.linreg(mt_pop.both_sexes,
                                                                 [1.0, mt_pop.score]))
            cols = mt_pop.cols()
            cols.select('description', 'p_threshold',
                        clump_pops_str=hl.delimit(cols.clump_pops, '-'),
                        prs_corr_r2=cols.prs_corr.multiple_r_squared,
                        prs_corr_pval=cols.prs_corr.p_value[1],
                        prs_corr_n=cols.prs_corr.n).export(
                f'gs://ukbb-diverse-temp-30day/prs/assess_prs{"_max_pops" if args.max_pops else ""}.{pop}.tsv.gz')
def checkpoint_bm(bm, path, read_if_exists=True):
    # write-or-reuse: only write if no successful write already exists at `path`
    if not hl.hadoop_is_file(f'{path}/_SUCCESS'):
        bm.write(path)
    bm = BlockMatrix.read(path)
    return bm
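Hypothetical usage of checkpoint_bm (the pipeline and path below are assumptions, not from the source): materializing an intermediate result means re-runs read the written copy instead of recomputing the whole BlockMatrix lineage.

# bm = expensive_block_matrix_pipeline()                  # hypothetical upstream work
# bm = checkpoint_bm(bm, 'gs://my-tmp/intermediate.bm')   # hypothetical path
# result = (bm @ bm.T).to_numpy()                         # downstream reads the checkpoint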