import hail as hl


def make_sample_rank_table(phe_ht: hl.Table) -> hl.Table:
    """
    Make a table ranking samples by retention priority (a lower rank means a
    higher priority). It mainly uses two pieces of information:

    - cases are prioritised over controls
    - samples are preferred based on cohort, as follows: chd > ddd > ukbb

    :param phe_ht: Table with sample metadata annotations (e.g. phenotype, cohort info...)
    :return: Hail Table
    """
    phe_ht = phe_ht.annotate(
        case_control_rank=hl.int(phe_ht['phe.is_case']),  # 0: control, 1: case
        cohort_rank=hl.case()
        .when(phe_ht.is_ukbb, 10)
        .when(phe_ht.is_ddd, 100)
        .when(phe_ht.is_chd, 1000)
        .or_missing()
    ).key_by()

    phe_ht = phe_ht.select('ega_id', 'case_control_rank', 'cohort_rank')

    # sort table (descending), so cases and higher-priority cohorts come first
    tb_rank = phe_ht.order_by(hl.desc(phe_ht.case_control_rank),
                              hl.desc(phe_ht.cohort_rank))
    tb_rank = tb_rank.add_index(name='rank').key_by('ega_id')
    tb_rank = tb_rank.annotate(rank=tb_rank.rank + 1)  # make ranks 1-based

    return tb_rank
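# Hedged usage sketch for make_sample_rank_table: a minimal toy phenotype table
# (the sample IDs and flag values below are hypothetical, and a Hail context is
# assumed to be initialised). The CHD case should come out with rank 1.
toy_phe_ht = hl.Table.parallelize(
    [
        {'ega_id': 'EGA_A', 'phe.is_case': True, 'is_ukbb': False, 'is_ddd': False, 'is_chd': True},
        {'ega_id': 'EGA_B', 'phe.is_case': True, 'is_ukbb': True, 'is_ddd': False, 'is_chd': False},
        {'ega_id': 'EGA_C', 'phe.is_case': False, 'is_ukbb': True, 'is_ddd': False, 'is_chd': False},
    ],
    hl.tstruct(**{'ega_id': hl.tstr, 'phe.is_case': hl.tbool, 'is_ukbb': hl.tbool,
                  'is_ddd': hl.tbool, 'is_chd': hl.tbool}),
)
make_sample_rank_table(toy_phe_ht).show()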
def test_order_by(self):
    ht = hl.utils.range_table(10)
    self.assertEqual(ht.order_by('idx').idx.collect(), list(range(10)))
    self.assertEqual(ht.order_by(hl.asc('idx')).idx.collect(), list(range(10)))
    self.assertEqual(ht.order_by(hl.desc('idx')).idx.collect(), list(range(10))[::-1])
import hail as hl


def interval_target_sum_ht():
    int_ht = hl.read_table(
        get_ccdg_results_path(data_type="exomes", result=f"intervals_{INTERVAL_DP}x")
    )
    int_ht = int_ht.explode(int_ht.target)
    # a target entry may hold several names separated by '|'; split on the
    # literal pipe (raw string, since split takes a regex)
    int_ht = int_ht.annotate(target2=int_ht.target.split(r"\|"))
    int_ht = int_ht.explode(int_ht.target2)
    target_ht = int_ht.group_by("target2").aggregate(
        total_len=hl.agg.sum(int_ht.int_len),
        filtered_len=hl.agg.filter(int_ht.to_keep, hl.agg.sum(int_ht.int_len)),
    )
    target_ht = target_ht.annotate(
        percent_len=target_ht.filtered_len / target_ht.total_len
    )
    target_ht = target_ht.order_by(hl.desc(target_ht.total_len))
    return target_ht
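# A minimal sketch of the explode → split → group_by aggregation pattern used
# above, on a toy interval table (all field values below are hypothetical):
toy_int_ht = hl.Table.parallelize(
    [
        {'target': ['GENE1|GENE2'], 'int_len': 100, 'to_keep': True},
        {'target': ['GENE2'], 'int_len': 50, 'to_keep': False},
    ],
    hl.tstruct(target=hl.tarray(hl.tstr), int_len=hl.tint64, to_keep=hl.tbool),
)
toy_int_ht = toy_int_ht.explode(toy_int_ht.target)
toy_int_ht = toy_int_ht.annotate(target2=toy_int_ht.target.split(r"\|"))
toy_int_ht = toy_int_ht.explode(toy_int_ht.target2)
toy_int_ht.group_by('target2').aggregate(
    total_len=hl.agg.sum(toy_int_ht.int_len)
).show()  # GENE2 appears in both rows, so its total_len is 150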
from typing import Dict, Optional, Union

import hail as hl


def _filter_agg_order(
    t: Union[hl.MatrixTable, hl.Table],
    group_exprs: Dict[str, hl.expr.Expression],
    extra_filter_checks: Optional[Dict[str, hl.expr.Expression]] = None,
    variant_filter_field: str = "RF",
    n_rows: Optional[int] = None,
    n_cols: Optional[int] = None,
) -> None:
    """
    Perform validity checks to measure percentages of variants filtered under different conditions.

    :param t: Input MatrixTable or Table.
    :param group_exprs: Dictionary of expressions to group the Table by.
    :param extra_filter_checks: Optional dictionary containing filter condition name (key) and extra filter expressions (value) to be examined.
    :param variant_filter_field: Name of the field indicating the variant filter status (e.g. "RF").
    :param n_rows: Number of rows to show. Default is None (to display 10 rows).
    :param n_cols: Number of columns to show. Default is None (to display 10 cols).
    :return: None
    """
    t = t.rows() if isinstance(t, hl.MatrixTable) else t
    # NOTE: make_filters_expr_dict returns a dict with percentages of variants filtered
    t.group_by(**group_exprs).aggregate(
        **make_filters_expr_dict(t, extra_filter_checks, variant_filter_field)
    ).order_by(hl.desc("n")).show(n_rows, n_cols)
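# Hypothetical invocation (a sketch: `ht`, `lcr`, and `segdup` are placeholder
# names, and make_filters_expr_dict must be importable as the body assumes):
_filter_agg_order(
    ht,
    group_exprs={'in_problematic_region': ht.lcr | ht.segdup},
    n_rows=20,
)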
import hail as hl
import numpy as np
from hail.linalg import BlockMatrix
from hail.utils import new_temp_file
from gnomad.utils.vep import process_consequences

# annotate_consequence_category, get_diag_mat, and POPS are helpers/constants
# assumed to be defined elsewhere in this module.


def main(args):
    ht_snp = hl.import_table(args.snp, impute=True)
    ht_snp = ht_snp.annotate(
        variant=hl.delimit(
            [ht_snp.chromosome, hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2],
            delimiter=':',
        )
    )
    ht_snp = ht_snp.annotate(**hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
    )
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract the most severe consequence, preferring canonical transcripts
    ht_snp = ht_snp.annotate(
        vep=(
            hl.case()
            .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
                  ht_snp.vep.worst_csq_for_variant_canonical)
            .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant),
                  ht_snp.vep.worst_csq_for_variant)
            .or_missing()
        ),
        is_canonical_vep=hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
    )
    ht_snp = ht_snp.annotate(
        most_severe=hl.if_else(hl.is_defined(ht_snp.vep),
                               ht_snp.vep.most_severe_consequence,
                               'intergenic_variant'),
        gene_most_severe=ht_snp.vep.gene_symbol,
    )
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(**annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())
    df = ht_snp.key_by().drop('locus', 'alleles', 'variant', 'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht'
        )
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')
        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())
        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm'
        )
        bm = bm.filter(idx, idx)
        # re-densify the triangular matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        # keep the lead variant's row and square the correlations to get r2
        bm = bm.filter_rows(np.where(np.array(idx) == lead_idx[0])[0].tolist()) ** 2
        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        df[f'gnomad_lead_r2_{pop}'] = np.nan
        df[f'gnomad_lead_r2_{pop}'].iloc[idx_snp] = r2

    fopen = hl.hadoop_open if args.out.startswith('gs://') else open
    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
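# Hypothetical CLI wiring for main(); the flag names --snp and --out are
# inferred from the args.snp / args.out references in the body:
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--snp', required=True, help='input SNP table (tab-delimited)')
    parser.add_argument('--out', required=True, help='output TSV path (local or gs://)')
    main(parser.parse_args())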
# Fragment: runs once per tissue_name (outer loop not shown in this excerpt).
mu0[tissue_name] = []  # restored: mu0 is appended to below, so it needs initialising like the others
mu1[tissue_name] = []
sd[tissue_name] = []
n[tissue_name] = []
ems = hl.read_table("gs://qingbowang/ems_v1_test/ems_pcausal_gtexvg_all{0}.ht".format(tissue_name))
ems = ems.annotate(
    hg38_ID=ems.vg.split("_")[0] + "_" + ems.vg.split("_")[1] + "_"
    + ems.vg.split("_")[2] + "_" + ems.vg.split("_")[3]
).key_by("hg38_ID").select("p_causal", "confidence_gain")
for categ in allcateg:
    comp = hl.read_table("gs://qingbowang/UKBB_nc_pp_susie_maxpip_{0}.ht".format(categ))
    ht0 = ems.join(comp, how="left")
    ht0 = ht0.filter(hl.is_defined(ht0.max_pip))  # remove entries that carry no complex-trait information
    mu0[tissue_name].append(ht0.aggregate(hl.agg.mean(ht0.max_pip)))
    # filter by confidence gain to reduce n before the order_by,
    # relaxing the threshold until at least 10k rows survive
    ht = ht0.filter(ht0.confidence_gain > 10)
    if ht.count() < 10000:
        ht = ht0.filter(ht0.confidence_gain > 1)
    if ht.count() < 10000:
        ht = ht0.filter(ht0.confidence_gain > 0.1)
    ht = ht.order_by(hl.desc(ht.p_causal))
    ht = ht.add_index()
    ht = ht.annotate(top10k=ht.idx < 10000)
    ht = ht.filter(ht.top10k)
    st = ht.aggregate(hl.agg.stats(ht.max_pip))
    mu1[tissue_name].append(st.mean)
    sd[tissue_name].append(st.stdev)
    n[tissue_name].append(st.n)
    print("done {0}, {1}".format(categ, tissue_name))
    print(mu0[tissue_name])
    print(st)
mu0 = pd.DataFrame(mu0)
mu1 = pd.DataFrame(mu1)
sd = pd.DataFrame(sd)
n = pd.DataFrame(n)
mu0.index = allcateg
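# Aside (a sketch, not part of the original pipeline): the
# order_by → add_index → annotate → filter chain above keeps the top 10,000
# rows by p_causal; Hail's Table.head expresses the same selection directly:
top10k = ht.order_by(hl.desc(ht.p_causal)).head(10000)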