def test_pchisqtail():
    """Compare ``hl.pchisqtail`` with right-tail probabilities computed by SciPy."""

    def right_tail_from_scipy(x, df, ncp):
        # A falsy non-centrality (None or 0) selects the central chi-squared
        # distribution; anything else uses the non-central one.
        if not ncp:
            cdf_value = spst.chi2.cdf(x, df)
        else:
            cdf_value = spst.ncx2.cdf(x, df, ncp)
        return 1 - cdf_value

    # Each case is (x, df, ncp).
    arglists = [
        [3, 1, 2],
        [5, 1, None],
        [1, 3, 4],
        [1, 3, None],
        [3, 6, 0],
        [3, 6, None],
    ]

    for args in arglists:
        actual = hl.eval(hl.pchisqtail(*args))
        expected = right_tail_from_scipy(*args)
        assert actual == pytest.approx(expected), args
def test(self):
    """Annotate a one-row table with a broad sample of Hail functions.

    The row is parallelized into a table, annotated with one field per
    function under test, and the first resulting row is converted with
    ``convert_struct_to_dict``.
    """
    xyz_struct = hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)
    schema = hl.tstruct(
        a=hl.tint32,
        b=hl.tint32,
        c=hl.tint32,
        d=hl.tint32,
        e=hl.tstr,
        f=hl.tarray(hl.tint32),
        g=hl.tarray(xyz_struct),
        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
        i=hl.tbool,
        j=xyz_struct,
    )

    only_row = {
        'a': 4,
        'b': 1,
        'c': 3,
        'd': 5,
        'e': "hello",
        'f': [1, 2, 3],
        'g': [hl.Struct(x=1, y=5, z='banana')],
        'h': hl.Struct(a=5, b=3, c='winter'),
        'i': True,
        'j': hl.Struct(x=3, y=2, z='summer'),
    }
    rows = [only_row]

    table = hl.Table.parallelize(rows, schema)

    # Keyword order is kept as-is: it determines the order of the new fields.
    annotated = table.annotate(
        chisq=hl.chisq(table.a, table.b, table.c, table.d),
        ctt=hl.ctt(table.a, table.b, table.c, table.d, 5),
        dict=hl.dict(hl.zip([table.a, table.b], [table.c, table.d])),
        dpois=hl.dpois(4, table.a),
        drop=table.h.drop('b', 'c'),
        exp=hl.exp(table.c),
        fet=hl.fisher_exact_test(table.a, table.b, table.c, table.d),
        hwe=hl.hardy_weinberg_p(1, 2, 1),
        index=hl.index(table.g, 'z'),
        is_defined=hl.is_defined(table.i),
        is_missing=hl.is_missing(table.i),
        is_nan=hl.is_nan(hl.float64(table.a)),
        json=hl.json(table.g),
        log=hl.log(table.a, table.b),
        log10=hl.log10(table.c),
        or_else=hl.or_else(table.a, 5),
        or_missing=hl.or_missing(table.i, table.j),
        pchisqtail=hl.pchisqtail(table.a, table.b),
        pcoin=hl.rand_bool(0.5),
        pnorm=hl.pnorm(0.2),
        pow=2.0 ** table.b,
        ppois=hl.ppois(table.a, table.b),
        qchisqtail=hl.qchisqtail(table.a, table.b),
        range=hl.range(0, 5, table.b),
        rnorm=hl.rand_norm(0.0, table.b),
        rpois=hl.rand_pois(table.a),
        runif=hl.rand_unif(table.b, table.a),
        select=table.h.select('c', 'b'),
        sqrt=hl.sqrt(table.a),
        to_str=[hl.str(5), hl.str(table.a), hl.str(table.g)],
        where=hl.cond(table.i, 5, 10),
    )

    first_row = annotated.take(1)[0]
    # NOTE(review): `result` is not read again within this view of the method.
    result = convert_struct_to_dict(first_row)
def transmission_disequilibrium_test(dataset, pedigree) -> Table:
    r"""Performs the transmission disequilibrium test on trios.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------
    Compute TDT association statistics and show the first two results:

    >>> pedigree = hl.Pedigree.read('data/tdt_trios.fam')
    >>> tdt_table = hl.transmission_disequilibrium_test(tdt_dataset, pedigree)
    >>> tdt_table.show(2)  # doctest: +NOTEST
    +---------------+------------+-------+-------+----------+----------+
    | locus         | alleles    |     t |     u |   chi_sq |  p_value |
    +---------------+------------+-------+-------+----------+----------+
    | locus<GRCh37> | array<str> | int64 | int64 |  float64 |  float64 |
    +---------------+------------+-------+-------+----------+----------+
    | 1:246714629   | ["C","A"]  |     0 |     4 | 4.00e+00 | 4.55e-02 |
    | 2:167262169   | ["T","C"]  |    NA |    NA |       NA |       NA |
    +---------------+------------+-------+-------+----------+----------+

    Export variants with p-values below 0.001:

    >>> tdt_table = tdt_table.filter(tdt_table.p_value < 0.001)
    >>> tdt_table.export("output/tdt_results.tsv")

    Notes
    -----
    The
    `transmission disequilibrium test <https://en.wikipedia.org/wiki/Transmission_disequilibrium_test#The_case_of_trios:_one_affected_child_per_family>`__
    compares the number of times the alternate allele is transmitted (t) versus
    not transmitted (u) from a heterozygous parent to an affected child. The null
    hypothesis holds that each case is equally likely. The TDT statistic is given by

    .. math::

        (t - u)^2 \over (t + u)

    and asymptotically follows a chi-squared distribution with one degree of
    freedom under the null hypothesis.

    :func:`transmission_disequilibrium_test` only considers complete trios (two
    parents and a proband with defined sex) and only returns results for the
    autosome, as defined by :meth:`~hail.genetics.Locus.in_autosome`, and
    chromosome X. Transmissions and non-transmissions are counted only for the
    configurations of genotypes and copy state in the table below, in order to
    filter out Mendel errors and configurations where transmission is
    guaranteed. The copy state of a locus with respect to a trio is defined as
    follows:

    - Auto -- in autosome or in PAR of X or female child
    - HemiX -- in non-PAR of X and male child

    Here PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__
    of X and Y defined by :class:`.ReferenceGenome`, which many variant callers
    map to chromosome X.

    +--------+--------+--------+------------+---+---+
    | Kid    | Dad    | Mom    | Copy State | t | u |
    +========+========+========+============+===+===+
    | HomRef | Het    | Het    | Auto       | 0 | 2 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | Het    | HomRef | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | Het    | Auto       | 1 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomRef | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomRef | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomVar | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomVar | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | Het    | Auto       | 2 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | HomVar | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomVar | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomRef | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+

    :func:`transmission_disequilibrium_test` produces a table with the
    following columns:

    - `locus` (:class:`.tlocus`) -- Locus.
    - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Alleles.
    - `t` (:py:data:`.tint64`) -- Number of transmitted alternate alleles.
    - `u` (:py:data:`.tint64`) -- Number of untransmitted alternate alleles.
    - `chi_sq` (:py:data:`.tfloat64`) -- TDT statistic.
    - `p_value` (:py:data:`.tfloat64`) -- p-value.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    pedigree : :class:`~hail.genetics.Pedigree`
        Sample pedigree.

    Returns
    -------
    :class:`.Table`
        Table of TDT results.
    """

    dataset = require_biallelic(dataset, 'transmission_disequilibrium_test')

    # Keep only autosomal and chromosome X rows; remember which of them are
    # diploid regardless of the proband's sex.
    dataset = dataset.annotate_rows(
        auto_or_x_par=dataset.locus.in_autosome() | dataset.locus.in_x_par())
    dataset = dataset.filter_rows(dataset.auto_or_x_par | dataset.locus.in_x_nonpar())

    # Genotypes are encoded by their number of alternate alleles.
    hom_ref = 0
    het = 1
    hom_var = 2

    # Copy states; these same values are used to build the lookup key below.
    auto = 2
    hemi_x = 1

    # kid, dad, mom, copy, t, u
    config_counts = [
        (hom_ref, het, het, auto, 0, 2),
        (hom_ref, hom_ref, het, auto, 0, 1),
        (hom_ref, het, hom_ref, auto, 0, 1),
        (het, het, het, auto, 1, 1),
        (het, hom_ref, het, auto, 1, 0),
        (het, het, hom_ref, auto, 1, 0),
        (het, hom_var, het, auto, 0, 1),
        (het, het, hom_var, auto, 0, 1),
        (hom_var, het, het, auto, 2, 0),
        (hom_var, het, hom_var, auto, 1, 0),
        (hom_var, hom_var, het, auto, 1, 0),
        (hom_ref, hom_ref, het, hemi_x, 0, 1),
        (hom_ref, hom_var, het, hemi_x, 0, 1),
        (hom_var, hom_ref, het, hemi_x, 1, 0),
        (hom_var, hom_var, het, hemi_x, 1, 0),
    ]

    # (kid, dad, mom, copy state) -> [t, u]
    count_map = hl.literal({
        (kid, dad, mom, state): [t, u]
        for kid, dad, mom, state, t, u in config_counts
    })

    tri = trio_matrix(dataset, pedigree, complete_trios=True)

    # This filter removes the Mendel error of a het father in x_nonpar. It also
    # avoids building and looking up the config in the common case that neither
    # parent is het.
    father_is_het = tri.father_entry.GT.is_het()
    parent_is_valid_het = ((father_is_het & tri.auto_or_x_par) |
                           (tri.mother_entry.GT.is_het() & ~father_is_het))

    # Use the named constants so the key always matches the table above.
    copy_state = hl.cond(tri.auto_or_x_par | tri.is_female, auto, hemi_x)

    config = (tri.proband_entry.GT.n_alt_alleles(),
              tri.father_entry.GT.n_alt_alleles(),
              tri.mother_entry.GT.n_alt_alleles(),
              copy_state)

    # Sum the [t, u] contributions of every trio with a valid het parent.
    tri = tri.annotate_rows(
        counts=agg.filter(parent_is_valid_het, agg.array_sum(count_map.get(config))))

    tab = tri.rows().select('counts')
    tab = tab.transmute(t=tab.counts[0], u=tab.counts[1])
    tab = tab.annotate(chi_sq=((tab.t - tab.u) ** 2) / (tab.t + tab.u))
    tab = tab.annotate(p_value=hl.pchisqtail(tab.chi_sq, 1.0))

    return tab.cache()
def transmission_disequilibrium_test(dataset, pedigree) -> Table:
    r"""Performs the transmission disequilibrium test on trios.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------
    Compute TDT association statistics and show the first two results:

    >>> pedigree = hl.Pedigree.read('data/tdt_trios.fam')
    >>> tdt_table = hl.transmission_disequilibrium_test(tdt_dataset, pedigree)
    >>> tdt_table.show(2)  # doctest: +NOTEST
    +---------------+------------+-------+-------+----------+----------+
    | locus         | alleles    |     t |     u |   chi_sq |  p_value |
    +---------------+------------+-------+-------+----------+----------+
    | locus<GRCh37> | array<str> | int64 | int64 |  float64 |  float64 |
    +---------------+------------+-------+-------+----------+----------+
    | 1:246714629   | ["C","A"]  |     0 |     4 | 4.00e+00 | 4.55e-02 |
    | 2:167262169   | ["T","C"]  |    NA |    NA |       NA |       NA |
    +---------------+------------+-------+-------+----------+----------+

    Export variants with p-values below 0.001:

    >>> tdt_table = tdt_table.filter(tdt_table.p_value < 0.001)
    >>> tdt_table.export("output/tdt_results.tsv")

    Notes
    -----
    The
    `transmission disequilibrium test <https://en.wikipedia.org/wiki/Transmission_disequilibrium_test#The_case_of_trios:_one_affected_child_per_family>`__
    compares the number of times the alternate allele is transmitted (t) versus
    not transmitted (u) from a heterozygous parent to an affected child. The null
    hypothesis holds that each case is equally likely. The TDT statistic is given by

    .. math::

        (t - u)^2 \over (t + u)

    and asymptotically follows a chi-squared distribution with one degree of
    freedom under the null hypothesis.

    :func:`transmission_disequilibrium_test` only considers complete trios (two
    parents and a proband with defined sex) and only returns results for the
    autosome, as defined by :meth:`~hail.genetics.Locus.in_autosome`, and
    chromosome X. Transmissions and non-transmissions are counted only for the
    configurations of genotypes and copy state in the table below, in order to
    filter out Mendel errors and configurations where transmission is
    guaranteed. The copy state of a locus with respect to a trio is defined as
    follows:

    - Auto -- in autosome or in PAR of X or female child
    - HemiX -- in non-PAR of X and male child

    Here PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__
    of X and Y defined by :class:`.ReferenceGenome`, which many variant callers
    map to chromosome X.

    +--------+--------+--------+------------+---+---+
    | Kid    | Dad    | Mom    | Copy State | t | u |
    +========+========+========+============+===+===+
    | HomRef | Het    | Het    | Auto       | 0 | 2 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | Het    | HomRef | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | Het    | Auto       | 1 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomRef | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomRef | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomVar | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomVar | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | Het    | Auto       | 2 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | HomVar | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomVar | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomRef | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+

    :func:`transmission_disequilibrium_test` produces a table with the
    following columns:

    - `locus` (:class:`.tlocus`) -- Locus.
    - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Alleles.
    - `t` (:py:data:`.tint64`) -- Number of transmitted alternate alleles.
    - `u` (:py:data:`.tint64`) -- Number of untransmitted alternate alleles.
    - `chi_sq` (:py:data:`.tfloat64`) -- TDT statistic.
    - `p_value` (:py:data:`.tfloat64`) -- p-value.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    pedigree : :class:`~hail.genetics.Pedigree`
        Sample pedigree.

    Returns
    -------
    :class:`.Table`
        Table of TDT results.
    """

    dataset = require_biallelic(dataset, 'transmission_disequilibrium_test')

    # Keep only autosomal and chromosome X rows; remember which of them are
    # diploid regardless of the proband's sex.
    dataset = dataset.annotate_rows(
        auto_or_x_par=dataset.locus.in_autosome() | dataset.locus.in_x_par())
    dataset = dataset.filter_rows(dataset.auto_or_x_par | dataset.locus.in_x_nonpar())

    # Genotypes are encoded by their number of alternate alleles.
    hom_ref = 0
    het = 1
    hom_var = 2

    # Copy states; these same values are used to build the lookup key below.
    auto = 2
    hemi_x = 1

    # kid, dad, mom, copy, t, u
    config_counts = [
        (hom_ref, het, het, auto, 0, 2),
        (hom_ref, hom_ref, het, auto, 0, 1),
        (hom_ref, het, hom_ref, auto, 0, 1),
        (het, het, het, auto, 1, 1),
        (het, hom_ref, het, auto, 1, 0),
        (het, het, hom_ref, auto, 1, 0),
        (het, hom_var, het, auto, 0, 1),
        (het, het, hom_var, auto, 0, 1),
        (hom_var, het, het, auto, 2, 0),
        (hom_var, het, hom_var, auto, 1, 0),
        (hom_var, hom_var, het, auto, 1, 0),
        (hom_ref, hom_ref, het, hemi_x, 0, 1),
        (hom_ref, hom_var, het, hemi_x, 0, 1),
        (hom_var, hom_ref, het, hemi_x, 1, 0),
        (hom_var, hom_var, het, hemi_x, 1, 0),
    ]

    # (kid, dad, mom, copy state) -> [t, u]
    count_map = hl.literal({
        (kid, dad, mom, state): [t, u]
        for kid, dad, mom, state, t, u in config_counts
    })

    tri = trio_matrix(dataset, pedigree, complete_trios=True)

    # This filter removes the Mendel error of a het father in x_nonpar. It also
    # avoids building and looking up the config in the common case that neither
    # parent is het.
    father_is_het = tri.father_entry.GT.is_het()
    parent_is_valid_het = ((father_is_het & tri.auto_or_x_par) |
                           (tri.mother_entry.GT.is_het() & ~father_is_het))

    # Use the named constants so the key always matches the table above.
    copy_state = hl.cond(tri.auto_or_x_par | tri.is_female, auto, hemi_x)

    config = (tri.proband_entry.GT.n_alt_alleles(),
              tri.father_entry.GT.n_alt_alleles(),
              tri.mother_entry.GT.n_alt_alleles(),
              copy_state)

    # Sum the [t, u] contributions of every trio with a valid het parent.
    tri = tri.annotate_rows(
        counts=agg.filter(parent_is_valid_het, agg.array_sum(count_map.get(config))))

    tab = tri.rows().select('counts')
    tab = tab.transmute(t=tab.counts[0], u=tab.counts[1])
    tab = tab.annotate(chi_sq=((tab.t - tab.u) ** 2) / (tab.t + tab.u))
    tab = tab.annotate(p_value=hl.pchisqtail(tab.chi_sq, 1.0))

    return tab.cache()
def main(args):
    """Run the inverse-variance-weighted meta-analysis and write the results.

    Loads the combined summary-statistics MatrixTable, derives per-entry
    meta-analysis fields (effect size, standard error, p-values, Cochran's Q,
    sample sizes and allele frequencies), packs them into ``meta_analysis``
    (entries) and ``meta_analysis_data`` (columns), and writes the table.

    ``args`` must provide ``keep_low_confidence_variants`` and ``overwrite``.

    NOTE(review): ``all_and_leave_one_out`` is defined elsewhere; it presumably
    yields one value for all populations plus one per left-out population --
    confirm against the helper.
    """
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(
        filter_phenos=True,
        filter_variants=False,
        filter_sumstats=True,
        separate_columns_by_pop=False,
        annotate_with_nearest_gene=False,
    )

    # Annotate per-entry sample size (cases plus controls, controls default to 0)
    def sample_size(pheno_data, idx):
        pheno = pheno_data[idx]
        return pheno.n_cases + hl.or_else(pheno.n_controls, 0)

    def with_sample_size(indexed):
        idx, stats = indexed[0], indexed[1]
        n_samples = hl.or_missing(hl.is_defined(stats), sample_size(mt.pheno_data, idx))
        return stats.annotate(N=n_samples)

    mt = mt.annotate_entries(
        summary_stats=hl.map(with_sample_size, hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(
            summary_stats=hl.map(
                lambda stats: hl.or_missing(~stats.low_confidence, stats),
                mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(
        unnorm_beta=mt.summary_stats.BETA / (mt.summary_stats.SE ** 2),
        inv_se2=1 / (mt.summary_stats.SE ** 2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta, mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(
        META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
        META_SE=hl.map(lambda weight_sum: hl.sqrt(1 / weight_sum), mt.sum_inv_se2))
    # Two-sided p-value from the (negated absolute) z-score.
    mt = mt.annotate_entries(
        META_Pvalue=hl.map(
            lambda neg_abs_z: 2 * hl.pnorm(neg_abs_z),
            -hl.abs(mt.META_BETA / mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(
        META_Q=hl.map(
            lambda meta_beta: hl.sum((mt.summary_stats.BETA - meta_beta) ** 2 * mt.inv_se2),
            mt.META_BETA),
        variant_exists=hl.map(lambda beta: ~hl.is_missing(beta), mt.summary_stats.BETA))
    mt = mt.annotate_entries(
        META_N_pops=all_and_leave_one_out(mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_Pvalue_het=hl.map(
            lambda idx: hl.pchisqtail(mt.META_Q[idx], mt.META_N_pops[idx] - 1),
            hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda stats: stats["AF.Cases"] * stats.N, mt.summary_stats),
        ac_controls=hl.map(lambda stats: stats["AF.Controls"] * stats.N, mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N, mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) / mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls, mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop(
        'unnorm_beta',
        'inv_se2',
        'variant_exists',
        'ac_cases',
        'ac_controls',
        'summary_stats',
        'META_AC_Allele2',
    )

    # Format everything into array<struct>
    def is_finite_or_missing(value):
        return hl.or_missing(hl.is_finite(value), value)

    meta_fields = [
        'BETA',
        'SE',
        'Pvalue',
        'Q',
        'Pvalue_het',
        'N',
        'N_pops',
        'AF_Allele2',
        'AF_Cases',
        'AF_Controls',
    ]

    def entry_struct(idx):
        # One struct per position, holding every META_* field at that position.
        return hl.struct(**{
            field: is_finite_or_missing(mt[f'META_{field}'][idx])
            for field in meta_fields
        })

    mt = mt.transmute_entries(
        meta_analysis=hl.map(entry_struct, hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    col_annotations = {
        field: all_and_leave_one_out(mt.pheno_data[field], mt.pheno_data.pop)
        for field in col_fields
    }
    mt = mt.annotate_cols(**col_annotations)

    col_fields.append('pop')
    mt = mt.annotate_cols(
        pop=all_and_leave_one_out(
            mt.pheno_data.pop,
            mt.pheno_data.pop,
            all_f=lambda pops: pops,
            loo_f=lambda idx, pops: hl.filter(lambda pop: pop != pops[idx], pops),
        ))

    def col_struct(idx):
        return hl.struct(**{field: mt[field][idx] for field in col_fields})

    mt = mt.transmute_cols(
        meta_analysis_data=hl.map(col_struct, hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')