def make_sample_rank_table(phe_ht: hl.Table) -> hl.Table:
    """
    Make table with rank of sample sorted by retention priority
    (lower rank has higher priority).
    It mainly uses two bits of information:
      - cases are prioritised over controls
      - samples are preferred based on the cohort info as follow: chd > ddd > ukbb
    :param phe_ht: Table with sample meta-data annotations (e.g. phenotype, cohort info...)
    :return: Hail Table
    """

    phe_ht = phe_ht.annotate(
        case_control_rank=hl.int(phe_ht['phe.is_case']),  # 0: control, 1: case
        cohort_rank=(hl.case()
                     .when(phe_ht.is_ukbb, 10)
                     .when(phe_ht.is_ddd, 100)
                     .when(phe_ht.is_chd, 1000)
                     .or_missing()),
    ).key_by()

    phe_ht = phe_ht.select('ega_id', 'case_control_rank', 'cohort_rank')

    # sort table (descending), so cases and higher-priority cohorts come first
    tb_rank = phe_ht.order_by(hl.desc(phe_ht.case_control_rank),
                              hl.desc(phe_ht.cohort_rank))

    tb_rank = tb_rank.add_index(name='rank').key_by('ega_id')

    # make the rank 1-based
    tb_rank = tb_rank.annotate(rank=tb_rank.rank + 1)

    return tb_rank
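
A minimal usage sketch (the field names 'phe.is_case', 'is_ukbb', 'is_ddd', 'is_chd', and 'ega_id' come from the function above; the sample IDs and values are made up for illustration):

import hail as hl

# toy phenotype table: two cases (one chd, one ukbb) and one ukbb control
phe_ht = hl.Table.parallelize(
    [
        {'ega_id': 'S1', 'phe.is_case': True, 'is_ukbb': False, 'is_ddd': False, 'is_chd': True},
        {'ega_id': 'S2', 'phe.is_case': True, 'is_ukbb': True, 'is_ddd': False, 'is_chd': False},
        {'ega_id': 'S3', 'phe.is_case': False, 'is_ukbb': True, 'is_ddd': False, 'is_chd': False},
    ],
    hl.tstruct(**{'ega_id': hl.tstr, 'phe.is_case': hl.tbool,
                  'is_ukbb': hl.tbool, 'is_ddd': hl.tbool, 'is_chd': hl.tbool}),
)

# expected: S1 (case, chd) rank 1, S2 (case, ukbb) rank 2, S3 (control) rank 3
make_sample_rank_table(phe_ht).show()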
Example 2
def test_order_by(self):
    ht = hl.utils.range_table(10)
    self.assertEqual(ht.order_by('idx').idx.collect(), list(range(10)))
    self.assertEqual(
        ht.order_by(hl.asc('idx')).idx.collect(), list(range(10)))
    self.assertEqual(
        ht.order_by(hl.desc('idx')).idx.collect(),
        list(range(10))[::-1])
Example 3
def interval_target_sum_ht():
    int_ht = hl.read_table(
        get_ccdg_results_path(data_type="exomes", result=f"intervals_{INTERVAL_DP}x")
    )
    int_ht = int_ht.explode(int_ht.target)
    int_ht = int_ht.annotate(target2=int_ht.target.split(r"\|"))  # split() takes a regex; escape the pipe
    int_ht = int_ht.explode(int_ht.target2)
    target_ht = int_ht.group_by("target2").aggregate(
        total_len=hl.agg.sum(int_ht.int_len),
        filtered_len=hl.agg.filter(int_ht.to_keep, hl.agg.sum(int_ht.int_len)),
    )
    target_ht = target_ht.annotate(
        percent_len=target_ht.filtered_len / target_ht.total_len
    )
    target_ht = target_ht.order_by(hl.desc(target_ht.total_len))
    return target_ht
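
Hail's split() interprets its argument as a regular expression, which is why the pipe is escaped above. A quick check:

import hail as hl

print(hl.eval(hl.str("geneA|geneB").split(r"\|")))  # ['geneA', 'geneB']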
    def _filter_agg_order(
        t: Union[hl.MatrixTable, hl.Table],
        group_exprs: Dict[str, hl.expr.Expression],
        n_rows: Optional[int] = None,
        n_cols: Optional[int] = None,
    ) -> None:
        """
        Perform validity checks to measure percentages of variants filtered under different conditions.

        `extra_filter_checks` (an optional dict mapping a filter condition name to an extra
        filter expression to be examined) and `variant_filter_field` are captured from the
        enclosing scope.

        :param t: Input MatrixTable or Table.
        :param group_exprs: Dictionary of expressions to group the Table by.
        :param n_rows: Number of rows to show. Default is None (displays 10 rows).
        :param n_cols: Number of columns to show. Default is None (displays 10 cols).
        :return: None
        """
        t = t.rows() if isinstance(t, hl.MatrixTable) else t
        # NOTE: make_filters_expr_dict returns a dict with percentages of variants filtered
        (t.group_by(**group_exprs)
         .aggregate(**make_filters_expr_dict(t, extra_filter_checks, variant_filter_field))
         .order_by(hl.desc("n"))
         .show(n_rows, n_cols))
def main(args):
    ht_snp = hl.import_table(args.snp, impute=True)
    ht_snp = ht_snp.annotate(variant=hl.delimit(
        [ht_snp.chromosome, hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2],
        delimiter=':'))
    ht_snp = ht_snp.annotate(
        **hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
    )
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract most severe
    ht_snp = ht_snp.annotate(
        vep=(hl.case()
             .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
                   ht_snp.vep.worst_csq_for_variant_canonical)
             .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant),
                   ht_snp.vep.worst_csq_for_variant)
             .or_missing()),
        is_canonical_vep=hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical))
    ht_snp = ht_snp.annotate(
        most_severe=hl.if_else(hl.is_defined(ht_snp.vep),
                               ht_snp.vep.most_severe_consequence,
                               'intergenic_variant'),
        gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(
        **annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())

    df = ht_snp.key_by().drop('locus', 'alleles', 'variant', 'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht'
        )
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')
        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())

        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm'
        )
        bm = bm.filter(idx, idx)
        # re-densify the triangular LD matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        # keep only the lead variant's row; square correlations to get r2
        bm = bm.filter_rows(
            np.where(np.array(idx) == lead_idx[0])[0].tolist()) ** 2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        df[f'gnomad_lead_r2_{pop}'] = np.nan
        df[f'gnomad_lead_r2_{pop}'].iloc[idx_snp] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open

    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
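
get_diag_mat is not defined in this snippet. A minimal sketch of what it plausibly does, assuming BlockMatrix.diagonal() returns a 1 x n row vector and the matrix fits in memory (a hypothetical reimplementation, not the original helper):

import numpy as np
from hail.linalg import BlockMatrix

def get_diag_mat(diag_vec: BlockMatrix) -> BlockMatrix:
    # densify the row vector and place its entries on the diagonal
    x = diag_vec.to_numpy().flatten()
    return BlockMatrix.from_numpy(np.diag(x))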
    mu0[tissue_name] = []  # assumed: mu0 is initialized like the other dicts (it is appended to below)
    mu1[tissue_name] = []
    sd[tissue_name] = []
    n[tissue_name] = []
    ems = hl.read_table("gs://qingbowang/ems_v1_test/ems_pcausal_gtexvg_all{0}.ht".format(tissue_name))
    ems = ems.annotate(
        hg38_ID=ems.vg.split("_")[0] + "_" + ems.vg.split("_")[1] + "_"
        + ems.vg.split("_")[2] + "_" + ems.vg.split("_")[3]
    ).key_by("hg38_ID").select("p_causal", "confidence_gain")
    for categ in allcateg:
        comp = hl.read_table("gs://qingbowang/UKBB_nc_pp_susie_maxpip_{0}.ht".format(categ))
        ht0 = ems.join(comp, how="left")
        ht0 = ht0.filter(hl.is_defined(ht0.max_pip))  # drop variants that do not have complex trait information
        mu0[tissue_name].append(ht0.aggregate(hl.agg.mean(ht0.max_pip)))
        # filter by confidence gain to shrink the table before order_by; relax the threshold if too few rows remain
        ht = ht0.filter(ht0.confidence_gain > 10)
        if ht.count() < 10000:
            ht = ht0.filter(ht0.confidence_gain > 1)
        if ht.count() < 10000:
            ht = ht0.filter(ht0.confidence_gain > 0.1)
        # keep the 10,000 rows with the highest p_causal
        ht = ht.order_by(hl.desc(ht.p_causal))
        ht = ht.add_index()
        ht = ht.annotate(top10k=ht.idx < 10000)
        ht = ht.filter(ht.top10k)
        st = ht.aggregate(hl.agg.stats(ht.max_pip))
        mu1[tissue_name].append(st.mean)
        sd[tissue_name].append(st.stdev)
        n[tissue_name].append(st.n)
        print ("done {0}, {1}".format(categ, tissue_name))
        print (mu0[tissue_name])
        print (st)
mu0 = pd.DataFrame(mu0)
mu1 = pd.DataFrame(mu1)
sd = pd.DataFrame(sd)
n = pd.DataFrame(n)
mu0.index = allcateg
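
The order_by / add_index / filter sequence above is a generic "top N rows by a field" pattern; distilled into a hypothetical helper:

import hail as hl

def top_n_by(ht: hl.Table, field: str, n: int = 10000) -> hl.Table:
    # order descending, index the sorted rows, keep the first n
    ht = ht.order_by(hl.desc(ht[field])).add_index()
    return ht.filter(ht.idx < n)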