def test_pca(self):
    """Check the outputs of ``hl.pca`` with and without loadings.

    Runs PCA with ``k=2`` on the alt-allele counts of a Balding-Nichols
    dataset and verifies that:

    * two eigenvalues are returned,
    * scores and loadings are ``hl.Table`` objects with 100 rows each,
    * loadings are ``None`` when ``compute_loadings=False``.
    """
    # NOTE(review): the positional arguments are presumably
    # (n_populations, n_samples, n_variants) -- confirm against Hail's docs.
    dataset = hl.balding_nichols_model(3, 100, 100)

    eigenvalues, scores, loadings = hl.pca(
        dataset.GT.n_alt_alleles(), k=2, compute_loadings=True)

    self.assertEqual(len(eigenvalues), 2)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only says "False is not true".
    self.assertIsInstance(scores, hl.Table)
    self.assertEqual(scores.count(), 100)
    self.assertIsInstance(loadings, hl.Table)
    self.assertEqual(loadings.count(), 100)

    _, _, loadings = hl.pca(
        dataset.GT.n_alt_alleles(), k=2, compute_loadings=False)
    # Identity check: does not go through any overloaded ``__eq__``.
    self.assertIsNone(loadings)
def run_platform_pca(
        callrate_mt: hl.MatrixTable,
        binarization_threshold: Optional[float] = 0.25
) -> Tuple[List[float], hl.Table, hl.Table]:
    """
    Run PCA on the `callrate` entry field of a call-rate MatrixTable.

    When `binarization_threshold` is not None, every `callrate` entry is first
    replaced by the integer value of `callrate > binarization_threshold`.
    The comparison is strict, so with the default of 0.25 an entry equal to
    0.25 becomes 0. The entries are then centered by subtracting the row-level
    mean of `callrate`, and `hl.pca` is run with loadings enabled.

    :param callrate_mt: Input MatrixTable with a `callrate` entry field
    :param binarization_threshold: Binarization cutoff. None if no binarization is desired
    :return: eigenvalues, scores_ht, loadings_ht
    """
    logger.info("Running platform PCA")

    mt = callrate_mt
    if binarization_threshold is not None:
        above_threshold = mt.callrate > binarization_threshold
        mt = mt.annotate_entries(callrate=hl.int(above_threshold))

    # Center by hand until Hail's PCA does it for you
    row_mean = hl.agg.mean(mt.callrate)
    mt = mt.annotate_rows(mean_callrate=row_mean)
    centered_callrate = mt.callrate - mt.mean_callrate
    mt = mt.annotate_entries(callrate=centered_callrate)

    # TODO: Evaluate whether computing loadings is a good / worthy thing
    pca_result = hl.pca(mt.callrate, compute_loadings=True)
    eigenvalues, scores_ht, loadings_ht = pca_result

    logger.info("Platform PCA eigenvalues: {}".format(eigenvalues))
    return eigenvalues, scores_ht, loadings_ht
def test_pca_against_numpy():
    """Compare ``hl.pca`` on ``tiny_m.vcf`` with an SVD computed by NumPy.

    The Hail side keeps biallelic rows whose alt-allele count is strictly
    between 0 and ``2 * n_called``, standardizes each genotype, and runs PCA
    with ``k=3``. The NumPy side normalizes a hard-coded 4x3 matrix the same
    way and takes its SVD. Eigenvalues are compared directly; scores and
    loadings are compared by absolute value.
    """
    mt = hl.import_vcf(resource('tiny_m.vcf'))
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    alt_allele_count = hl.agg.sum(mt.GT.n_alt_alleles())
    called_count = hl.agg.count_where(hl.is_defined(mt.GT))
    mt = mt.annotate_rows(AC=alt_allele_count, n_called=called_count)

    is_polymorphic = (mt.AC > 0) & (mt.AC < 2 * mt.n_called)
    mt = mt.filter_rows(is_polymorphic).persist()
    n_rows = mt.count_rows()

    def standardized_genotype(mean):
        # Missing genotypes contribute 0 after standardization.
        centered = mt.GT.n_alt_alleles() - mean
        scale = hl.sqrt(mean * (2 - mean) * n_rows / 2)
        return hl.if_else(hl.is_defined(mt.GT), centered / scale, 0)

    mean_genotype = mt.AC / mt.n_called
    eigen, scores, loadings = hl.pca(
        hl.bind(standardized_genotype, mean_genotype),
        k=3,
        compute_loadings=True)

    hail_scores = scores.explode('scores').scores.collect()
    hail_loadings = loadings.explode('loadings').loadings.collect()

    assert len(eigen) == 3
    assert scores.count() == mt.count_cols()
    assert loadings.count() == n_rows
    assert len(scores.globals) == 0
    assert len(loadings.globals) == 0

    # Reference PCA computed with NumPy.
    def normalize(genotypes):
        col_means = np.mean(genotypes, axis=0, keepdims=True)
        half_means = col_means / 2.0
        variance_term = 2.0 * (half_means * (1 - col_means / 2.0)) * genotypes.shape[1]
        return (genotypes - col_means) / np.sqrt(variance_term)

    # 3x3 diagonal matrix with one extra all-zero row appended, then one
    # entry overridden.
    # NOTE(review): presumably mirrors the genotypes in tiny_m.vcf -- confirm.
    g = np.pad(np.diag([1.0, 1, 2]), ((0, 1), (0, 0)), mode='constant')
    g[1, 0] = 1.0 / 3

    left, singular, right = np.linalg.svd(normalize(g), full_matrices=False)
    np_scores = (left @ np.diag(singular)).flatten()
    np_loadings = right.T.flatten()
    np_eigenvalues = (singular * singular).flatten()

    np.testing.assert_allclose(eigen, np_eigenvalues, rtol=1e-5)
    # Absolute values are compared, presumably because the sign of each
    # component is arbitrary.
    np.testing.assert_allclose(np.abs(hail_scores), np.abs(np_scores), rtol=1e-5)
    np.testing.assert_allclose(np.abs(hail_loadings), np.abs(np_loadings), rtol=1e-5)
def main(args):
    """Run the exome platform-PCA pipeline in up to three stages.

    1. Unless ``args.skip_prepare_data_for_platform_pca`` is set, build a
       per-interval call-rate MatrixTable and write it to
       ``exome_callrate_mt_path``.
    2. Unless ``args.skip_run_platform_pca`` is set, read that MatrixTable,
       drop columns with any hard filter, binarize the call rate at 0.25,
       center it per row, run ``hl.pca`` and write the scores to
       ``exome_callrate_scores_ht_path``.
    3. Always: read the scores back, optionally gather ``PC*`` fields into a
       single ``scores`` array, assign platforms and write the result.

    Every write honours ``args.overwrite``.

    :param args: Parsed arguments; reads ``skip_prepare_data_for_platform_pca``,
        ``skip_run_platform_pca``, ``pc_scores_in_separate_fields`` and
        ``overwrite``
    """
    hl.init(log='/platform_pca.log')

    if not args.skip_prepare_data_for_platform_pca:
        # ~1 hour on 800 cores (3/8/18)
        logger.info('Preparing data for platform PCA...')
        mt = get_gnomad_data('exomes', adj=True, raw=False, meta_root=None, fam_root=None, split=False)
        mt = filter_to_autosomes(mt)
        intervals = hl.import_locus_intervals(evaluation_intervals_path)
        mt = mt.annotate_rows(interval=intervals[mt.locus].target)
        mt = mt.filter_rows(hl.is_defined(mt.interval) & (hl.len(mt.alleles) == 2))
        # Only definedness of GT is used below, so replace it with an empty struct.
        mt = mt.select_entries(GT=hl.or_missing(hl.is_defined(mt.GT), hl.struct()))
        callrate_mt = mt.group_rows_by(mt.interval).aggregate(callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        callrate_mt.write(exome_callrate_mt_path, overwrite=args.overwrite)

    if not args.skip_run_platform_pca:
        logger.info('Running platform PCA...')
        qc_ht = hl.read_table(qc_ht_path('exomes', 'hard_filters')).key_by('s')
        callrate_mt = hl.read_matrix_table(exome_callrate_mt_path)
        callrate_mt = callrate_mt.filter_cols(hl.len(qc_ht[callrate_mt.col_key].hard_filters) == 0)
        callrate_mt = callrate_mt.annotate_entries(callrate=hl.int(callrate_mt.callrate > 0.25))
        # Center until Hail's PCA does it for you
        callrate_mt = callrate_mt.annotate_rows(mean_callrate=hl.agg.mean(callrate_mt.callrate))
        callrate_mt = callrate_mt.annotate_entries(callrate=callrate_mt.callrate - callrate_mt.mean_callrate)
        eigenvalues, scores, _ = hl.pca(callrate_mt.callrate, compute_loadings=False)
        logger.info('Eigenvalues: {}'.format(eigenvalues))
        # [731282566.2824697, 78687228.90071851, 43837650.51729764, 33969298.61827205, 26308703.539534636, 21102437.512725923, 16949828.555817757, 12994894.187041137, 8372332.274295175, 8128326.814388647]
        # Honour --overwrite like the other writes in this function; without
        # it a re-run fails when the scores table already exists.
        scores.write(exome_callrate_scores_ht_path, overwrite=args.overwrite)

    logger.info('Annotating with platform PCs and known platform annotations...')
    # Scores are re-read from disk so this stage also works when the PCA
    # stage above was skipped.
    scores = hl.read_table(exome_callrate_scores_ht_path).annotate(data_type='exomes')
    if args.pc_scores_in_separate_fields:
        # Collect fields named PC<number> into one array, ordered numerically
        # by the suffix after "PC".
        pc_fields = sorted(
            (field for field in scores.row if field.startswith("PC")),
            key=lambda name: int(name[2:]),
        )
        scores = scores.transmute(scores=[scores[field] for field in pc_fields])

    platform_pcs = assign_platform_pcs(scores)
    platform_pcs.write(qc_ht_path('exomes', 'platforms'), overwrite=args.overwrite)
def test_blanczos_against_hail():
    """Compare ``hl._blanczos_pca`` with ``hl.pca`` on the 1kg dataset.

    Both are run with ``k=10`` on the 0/1 indicator of whether ``GT`` is
    defined. The test requires the eigenvalues to agree within 5% and the
    bound computed from the two sets of loadings to exceed 0.9.
    """
    k = 10

    def stack_blocks(field, horizontal=True):
        # Collect the field and join the pieces along axis 0 (horizontal)
        # or axis 1.
        axis = 0 if horizontal else 1
        return np.concatenate(field.collect(), axis=axis)

    def defined_indicator(mt):
        # 1 where the genotype is defined, 0 otherwise.
        return hl.int(hl.is_defined(mt.GT))

    hl.utils.get_1kg('data/')
    hl.import_vcf('data/1kg.vcf.bgz').write('data/1kg.mt', overwrite=True)
    dataset = hl.read_matrix_table('data/1kg.mt')

    b_eigens, b_scores_ht, b_loadings_ht = hl._blanczos_pca(
        defined_indicator(dataset), k=k, q_iterations=3, compute_loadings=True)
    b_scores = stack_blocks(b_scores_ht.scores)
    b_loadings = stack_blocks(b_loadings_ht.loadings)
    b_scores = np.reshape(b_scores, (len(b_scores) // k, k))
    b_loadings = np.reshape(b_loadings, (len(b_loadings) // k, k))

    h_eigens, h_scores_ht, h_loadings_ht = hl.pca(
        defined_indicator(dataset), k=k, compute_loadings=True)
    h_scores = np.reshape(stack_blocks(h_scores_ht.scores), b_scores.shape)
    h_loadings = np.reshape(stack_blocks(h_loadings_ht.loadings), b_loadings.shape)

    # equation 12 from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4827102/pdf/main.pdf
    def bound(vs, us):
        projected_norms = (np.linalg.norm(us.T @ vs[:, i]) for i in range(k))
        return 1 / k * sum(projected_norms)

    mev = bound(h_loadings, b_loadings)

    np.testing.assert_allclose(b_eigens, h_eigens, rtol=0.05)
    assert mev > 0.9