def test_lambda_gc(self):
    """Check ``hl.lambda_gc`` against reference values on a range table.

    The table has two annotated fields: ``x`` is the scan count divided by
    the row count, and ``x2`` is that same ratio raised to the power 1.5.
    """
    n_rows = 5000000
    table = hl.utils.range_table(n_rows)
    table = table.annotate(
        x=hl.scan.count() / n_rows,
        x2=(hl.scan.count() / n_rows) ** 1.5,
    )

    # Evaluate both statistics up front, then compare them to the targets.
    observed = [hl.lambda_gc(field) for field in (table.x, table.x2)]
    targets = (1, 1.89)

    for actual, target in zip(observed, targets):
        # approximate, 1 place is safe
        self.assertAlmostEqual(actual, target, places=1)
def qqplot(pvals, title: str = None, figsize: tuple = (10, 10)):
    """Build a QQ plot of observed versus expected ``-log10`` p-values.

    Parameters
    ----------
    pvals
        Hail expression holding the p-values. Its source
        (``pvals._indices.source``) is either a ``Table`` or an object that
        supports ``select_rows(...).rows()`` (presumably a MatrixTable --
        confirm against callers).
    title
        Title of the plot. Any falsy value (including ``None``) falls back
        to ``'QQ Plot'``.
    figsize
        Figure size forwarded to ``plt.figure``.

    Returns
    -------
    tuple
        ``(fig, lambda_gc)`` where ``fig`` is the (already closed) matplotlib
        figure and ``lambda_gc`` is ``hl.lambda_gc`` of the p-values rounded
        to three decimals.
    """
    # Observed -log10(p) values above this are drawn at the cap.
    observed_cap = 10

    source = pvals._indices.source
    if isinstance(source, Table):
        ht = source.select(p_value=pvals)
    else:
        ht = source.select_rows(p_value=pvals).rows()

    # Discard the original key and re-key by the p-value itself.
    # NOTE(review): this relies on Hail ordering rows by key so that the scan
    # count below acts as the rank of each p-value -- confirm.
    ht = ht.key_by().select('p_value').key_by('p_value').persist()
    lambda_gc = hl.lambda_gc(ht['p_value'])
    n = ht.count()

    ht = ht.annotate(
        observed_p=-hl.log10(ht['p_value']),
        expected_p=-hl.log10((hl.scan.count() + 1) / n),
    ).persist()

    p_val_pd = ht.to_pandas()

    # Cap through .loc on the frame: writing into `Series.values` only works
    # when pandas hands back a writable view, and raises under Copy-on-Write.
    p_val_pd.loc[p_val_pd['observed_p'] > observed_cap, 'observed_p'] = observed_cap

    expected_max = p_val_pd['expected_p'].max()
    observed_max = p_val_pd['observed_p'].max()
    mini = min(expected_max, observed_max)
    maxi = max(expected_max, observed_max)

    title = f'{title}' if title else 'QQ Plot'

    fig = plt.figure(figsize=figsize)
    plt.scatter(p_val_pd['expected_p'], p_val_pd['observed_p'], c='black', s=0.5)
    # Identity line, drawn up to the smaller of the two axis maxima.
    plt.plot((0, mini), (0, mini), 'red')
    plt.xlim([0, maxi + 0.5])
    plt.ylim([0, maxi + 0.5])
    plt.title(title, fontsize=20)
    plt.ylabel(r'Observed -log10($p$)', fontsize=15)
    plt.xlabel(r'Expected -log10($p$)', fontsize=15)
    # Close this specific figure (not just "the current one") so it is not
    # left registered with pyplot; the Figure object is still returned.
    plt.close(fig)

    return fig, round(lambda_gc, 3)