Esempio n. 1
0
def test_pchisqtail():
    def right_tail_from_scipy(x, df, ncp):
        if ncp:
            return 1 - spst.ncx2.cdf(x, df, ncp)
        else:
            return 1 - spst.chi2.cdf(x, df)

    arglists = [[3, 1, 2], [5, 1, None], [1, 3, 4], [1, 3, None], [3, 6, 0],
                [3, 6, None]]

    for args in arglists:
        assert hl.eval(hl.pchisqtail(*args)) == pytest.approx(
            right_tail_from_scipy(*args)), args
Esempio n. 2
0
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Esempio n. 3
0
def transmission_disequilibrium_test(dataset, pedigree) -> Table:
    r"""Performs the transmission disequilibrium test on trios.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------
    Compute TDT association statistics and show the first two results:
    
    >>> pedigree = hl.Pedigree.read('data/tdt_trios.fam')
    >>> tdt_table = hl.transmission_disequilibrium_test(tdt_dataset, pedigree)
    >>> tdt_table.show(2)  # doctest: +NOTEST
    +---------------+------------+-------+-------+----------+----------+
    | locus         | alleles    |     t |     u |   chi_sq |  p_value |
    +---------------+------------+-------+-------+----------+----------+
    | locus<GRCh37> | array<str> | int64 | int64 |  float64 |  float64 |
    +---------------+------------+-------+-------+----------+----------+
    | 1:246714629   | ["C","A"]  |     0 |     4 | 4.00e+00 | 4.55e-02 |
    | 2:167262169   | ["T","C"]  |    NA |    NA |       NA |       NA |
    +---------------+------------+-------+-------+----------+----------+

    Export variants with p-values below 0.001:

    >>> tdt_table = tdt_table.filter(tdt_table.p_value < 0.001)
    >>> tdt_table.export("output/tdt_results.tsv")

    Notes
    -----
    The
    `transmission disequilibrium test <https://en.wikipedia.org/wiki/Transmission_disequilibrium_test#The_case_of_trios:_one_affected_child_per_family>`__
    compares the number of times the alternate allele is transmitted (t) versus
    not transmitted (u) from a heterozgyous parent to an affected child. The null
    hypothesis holds that each case is equally likely. The TDT statistic is given by

    .. math::

        (t - u)^2 \over (t + u)

    and asymptotically follows a chi-squared distribution with one degree of
    freedom under the null hypothesis.

    :func:`transmission_disequilibrium_test` only considers complete trios (two
    parents and a proband with defined sex) and only returns results for the
    autosome, as defined by :meth:`~hail.genetics.Locus.in_autosome`, and
    chromosome X. Transmissions and non-transmissions are counted only for the
    configurations of genotypes and copy state in the table below, in order to
    filter out Mendel errors and configurations where transmission is
    guaranteed. The copy state of a locus with respect to a trio is defined as
    follows:

    - Auto -- in autosome or in PAR of X or female child
    - HemiX -- in non-PAR of X and male child

    Here PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__
    of X and Y defined by :class:`.ReferenceGenome`, which many variant callers
    map to chromosome X.

    +--------+--------+--------+------------+---+---+
    |  Kid   | Dad    | Mom    | Copy State | t | u |
    +========+========+========+============+===+===+
    | HomRef | Het    | Het    | Auto       | 0 | 2 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | Het    | HomRef | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | Het    | Auto       | 1 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomRef | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomRef | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomVar | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomVar | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | Het    | Auto       | 2 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | HomVar | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomVar | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomRef | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+

    :func:`tdt` produces a table with the following columns:

     - `locus` (:class:`.tlocus`) -- Locus.
     - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Alleles.
     - `t` (:py:data:`.tint32`) -- Number of transmitted alternate alleles.
     - `u` (:py:data:`.tint32`) -- Number of untransmitted alternate alleles.
     - `chi_sq` (:py:data:`.tfloat64`) -- TDT statistic.
     - `p_value` (:py:data:`.tfloat64`) -- p-value.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    pedigree : :class:`~hail.genetics.Pedigree`
        Sample pedigree.

    Returns
    -------
    :class:`.Table`
        Table of TDT results.
    """

    dataset = require_biallelic(dataset, 'transmission_disequilibrium_test')
    dataset = dataset.annotate_rows(auto_or_x_par=dataset.locus.in_autosome()
                                    | dataset.locus.in_x_par())
    dataset = dataset.filter_rows(dataset.auto_or_x_par
                                  | dataset.locus.in_x_nonpar())

    hom_ref = 0
    het = 1
    hom_var = 2

    auto = 2
    hemi_x = 1

    #                     kid,     dad,     mom,   copy, t, u
    config_counts = [(hom_ref, het, het, auto, 0, 2),
                     (hom_ref, hom_ref, het, auto, 0, 1),
                     (hom_ref, het, hom_ref, auto, 0, 1),
                     (het, het, het, auto, 1, 1),
                     (het, hom_ref, het, auto, 1, 0),
                     (het, het, hom_ref, auto, 1, 0),
                     (het, hom_var, het, auto, 0, 1),
                     (het, het, hom_var, auto, 0, 1),
                     (hom_var, het, het, auto, 2, 0),
                     (hom_var, het, hom_var, auto, 1, 0),
                     (hom_var, hom_var, het, auto, 1, 0),
                     (hom_ref, hom_ref, het, hemi_x, 0, 1),
                     (hom_ref, hom_var, het, hemi_x, 0, 1),
                     (hom_var, hom_ref, het, hemi_x, 1, 0),
                     (hom_var, hom_var, het, hemi_x, 1, 0)]

    count_map = hl.literal({(c[0], c[1], c[2], c[3]): [c[4], c[5]]
                            for c in config_counts})

    tri = trio_matrix(dataset, pedigree, complete_trios=True)

    # this filter removes mendel error of het father in x_nonpar. It also avoids
    #   building and looking up config in common case that neither parent is het
    father_is_het = tri.father_entry.GT.is_het()
    parent_is_valid_het = ((father_is_het & tri.auto_or_x_par) |
                           (tri.mother_entry.GT.is_het() & ~father_is_het))

    copy_state = hl.cond(tri.auto_or_x_par | tri.is_female, 2, 1)

    config = (tri.proband_entry.GT.n_alt_alleles(),
              tri.father_entry.GT.n_alt_alleles(),
              tri.mother_entry.GT.n_alt_alleles(), copy_state)

    tri = tri.annotate_rows(counts=agg.filter(
        parent_is_valid_het, agg.array_sum(count_map.get(config))))

    tab = tri.rows().select('counts')
    tab = tab.transmute(t=tab.counts[0], u=tab.counts[1])
    tab = tab.annotate(chi_sq=((tab.t - tab.u)**2) / (tab.t + tab.u))
    tab = tab.annotate(p_value=hl.pchisqtail(tab.chi_sq, 1.0))

    return tab.cache()
Esempio n. 4
0
def transmission_disequilibrium_test(dataset, pedigree) -> Table:
    r"""Performs the transmission disequilibrium test on trios.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------
    Compute TDT association statistics and show the first two results:
    
    >>> pedigree = hl.Pedigree.read('data/tdt_trios.fam')
    >>> tdt_table = hl.transmission_disequilibrium_test(tdt_dataset, pedigree)
    >>> tdt_table.show(2)  # doctest: +NOTEST
    +---------------+------------+-------+-------+----------+----------+
    | locus         | alleles    |     t |     u |   chi_sq |  p_value |
    +---------------+------------+-------+-------+----------+----------+
    | locus<GRCh37> | array<str> | int64 | int64 |  float64 |  float64 |
    +---------------+------------+-------+-------+----------+----------+
    | 1:246714629   | ["C","A"]  |     0 |     4 | 4.00e+00 | 4.55e-02 |
    | 2:167262169   | ["T","C"]  |    NA |    NA |       NA |       NA |
    +---------------+------------+-------+-------+----------+----------+

    Export variants with p-values below 0.001:

    >>> tdt_table = tdt_table.filter(tdt_table.p_value < 0.001)
    >>> tdt_table.export("output/tdt_results.tsv")

    Notes
    -----
    The
    `transmission disequilibrium test <https://en.wikipedia.org/wiki/Transmission_disequilibrium_test#The_case_of_trios:_one_affected_child_per_family>`__
    compares the number of times the alternate allele is transmitted (t) versus
    not transmitted (u) from a heterozgyous parent to an affected child. The null
    hypothesis holds that each case is equally likely. The TDT statistic is given by

    .. math::

        (t - u)^2 \over (t + u)

    and asymptotically follows a chi-squared distribution with one degree of
    freedom under the null hypothesis.

    :func:`transmission_disequilibrium_test` only considers complete trios (two
    parents and a proband with defined sex) and only returns results for the
    autosome, as defined by :meth:`~hail.genetics.Locus.in_autosome`, and
    chromosome X. Transmissions and non-transmissions are counted only for the
    configurations of genotypes and copy state in the table below, in order to
    filter out Mendel errors and configurations where transmission is
    guaranteed. The copy state of a locus with respect to a trio is defined as
    follows:

    - Auto -- in autosome or in PAR of X or female child
    - HemiX -- in non-PAR of X and male child

    Here PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__
    of X and Y defined by :class:`.ReferenceGenome`, which many variant callers
    map to chromosome X.

    +--------+--------+--------+------------+---+---+
    |  Kid   | Dad    | Mom    | Copy State | t | u |
    +========+========+========+============+===+===+
    | HomRef | Het    | Het    | Auto       | 0 | 2 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | Het    | HomRef | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | Het    | Auto       | 1 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomRef | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomRef | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomVar | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomVar | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | Het    | Auto       | 2 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | HomVar | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomVar | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomRef | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+

    :func:`tdt` produces a table with the following columns:

     - `locus` (:class:`.tlocus`) -- Locus.
     - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Alleles.
     - `t` (:py:data:`.tint32`) -- Number of transmitted alternate alleles.
     - `u` (:py:data:`.tint32`) -- Number of untransmitted alternate alleles.
     - `chi_sq` (:py:data:`.tfloat64`) -- TDT statistic.
     - `p_value` (:py:data:`.tfloat64`) -- p-value.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    pedigree : :class:`~hail.genetics.Pedigree`
        Sample pedigree.

    Returns
    -------
    :class:`.Table`
        Table of TDT results.
    """

    dataset = require_biallelic(dataset, 'transmission_disequilibrium_test')
    dataset = dataset.annotate_rows(auto_or_x_par = dataset.locus.in_autosome() | dataset.locus.in_x_par())
    dataset = dataset.filter_rows(dataset.auto_or_x_par | dataset.locus.in_x_nonpar())

    hom_ref = 0
    het = 1
    hom_var = 2

    auto = 2
    hemi_x = 1

    #                     kid,     dad,     mom,   copy, t, u
    config_counts = [(hom_ref,     het,     het,   auto, 0, 2),
                     (hom_ref, hom_ref,     het,   auto, 0, 1),
                     (hom_ref,     het, hom_ref,   auto, 0, 1),
                     (    het,     het,     het,   auto, 1, 1),
                     (    het, hom_ref,     het,   auto, 1, 0),
                     (    het,     het, hom_ref,   auto, 1, 0),
                     (    het, hom_var,     het,   auto, 0, 1),
                     (    het,     het, hom_var,   auto, 0, 1),
                     (hom_var,     het,     het,   auto, 2, 0),
                     (hom_var,     het, hom_var,   auto, 1, 0),
                     (hom_var, hom_var,     het,   auto, 1, 0),
                     (hom_ref, hom_ref,     het, hemi_x, 0, 1),
                     (hom_ref, hom_var,     het, hemi_x, 0, 1),
                     (hom_var, hom_ref,     het, hemi_x, 1, 0),
                     (hom_var, hom_var,     het, hemi_x, 1, 0)]

    count_map = hl.literal({(c[0], c[1], c[2], c[3]): [c[4], c[5]] for c in config_counts})

    tri = trio_matrix(dataset, pedigree, complete_trios=True)

    # this filter removes mendel error of het father in x_nonpar. It also avoids
    #   building and looking up config in common case that neither parent is het
    father_is_het = tri.father_entry.GT.is_het()
    parent_is_valid_het = ((father_is_het & tri.auto_or_x_par) |
                           (tri.mother_entry.GT.is_het() & ~father_is_het))

    copy_state = hl.cond(tri.auto_or_x_par | tri.is_female, 2, 1)

    config = (tri.proband_entry.GT.n_alt_alleles(),
              tri.father_entry.GT.n_alt_alleles(),
              tri.mother_entry.GT.n_alt_alleles(),
              copy_state)

    tri = tri.annotate_rows(counts = agg.filter(parent_is_valid_het, agg.array_sum(count_map.get(config))))

    tab = tri.rows().select('counts')
    tab = tab.transmute(t = tab.counts[0], u = tab.counts[1])
    tab = tab.annotate(chi_sq = ((tab.t - tab.u) ** 2) / (tab.t + tab.u))
    tab = tab.annotate(p_value = hl.pchisqtail(tab.chi_sq, 1.0))

    return tab.cache()
Esempio n. 5
0
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(unnorm_beta=mt.summary_stats.BETA /
                             (mt.summary_stats.SE**2),
                             inv_se2=1 / (mt.summary_stats.SE**2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta,
                                              mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
                              META_SE=hl.map(lambda x: hl.sqrt(1 / x),
                                             mt.sum_inv_se2))
    mt = mt.annotate_entries(
        META_Pvalue=hl.map(lambda x: 2 * hl.pnorm(x), -hl.abs(mt.META_BETA /
                                                              mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(META_Q=hl.map(
        lambda x: hl.sum((mt.summary_stats.BETA - x)**2 * mt.inv_se2),
        mt.META_BETA),
                             variant_exists=hl.map(lambda x: ~hl.is_missing(x),
                                                   mt.summary_stats.BETA))
    mt = mt.annotate_entries(META_N_pops=all_and_leave_one_out(
        mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N, mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N,
            mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) /
        mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls,
                                               mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return (hl.or_missing(hl.is_finite(x), x))

    meta_fields = [
        'BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops', 'AF_Allele2',
        'AF_Cases', 'AF_Controls'
    ]
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(
            **{
                field: is_finite_or_missing(mt[f'META_{field}'][i])
                for field in meta_fields
            }), hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(
        **{
            field: all_and_leave_one_out(mt.pheno_data[field],
                                         mt.pheno_data.pop)
            for field in col_fields
        })
    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i]
                               for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')