def combine(ts):
    """Merge a group of gVCF row-groups into a single combined row.

    `ts` is expected to be a table whose rows carry a ``data`` array (one
    element per input dataset, possibly missing) and a ``g`` array of
    per-dataset globals with ``__cols``.  Produces merged ``alleles``,
    ``rsid``, ``filters``, summed ``info`` fields, and a flattened
    ``__entries`` array renumbered against the combined allele index.
    """
    # pylint: disable=protected-access
    tmp = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        # first defined rsid across the inputs wins
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        # union of all input filter sets
        filters=hl.set(hl.flatten(ts.data.map(lambda d: hl.array(d.filters)))),
        # these INFO fields are additive across inputs, including each SB slot
        info=hl.struct(
            DP=hl.sum(ts.data.map(lambda d: d.info.DP)),
            MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)),
            QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)),
            RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)),
            VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)),
            SB=hl.array([
                hl.sum(ts.data.map(lambda d: d.info.SB[0])),
                hl.sum(ts.data.map(lambda d: d.info.SB[1])),
                hl.sum(ts.data.map(lambda d: d.info.SB[2])),
                hl.sum(ts.data.map(lambda d: d.info.SB[3]))
            ])))
    tmp = tmp.annotate(
        __entries=hl.bind(
            # combined_allele_index maps each merged allele to its new position
            lambda combined_allele_index:
            hl.range(0, hl.len(tmp.data)).flatmap(
                lambda i:
                # a missing dataset contributes one missing entry per column
                hl.cond(hl.is_missing(tmp.data[i].__entries),
                        hl.range(0, hl.len(tmp.g[i].__cols))
                        .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)),
                        hl.bind(
                            # old_to_new[j] = new index of dataset i's allele j
                            lambda old_to_new: tmp.data[i].__entries.map(lambda e: renumber_entry(e, old_to_new)),
                            hl.range(0, hl.len(tmp.data[i].alleles)).map(
                                lambda j: combined_allele_index[tmp.data[i].alleles[j]])))),
            hl.dict(hl.range(0, hl.len(tmp.alleles)).map(
                lambda j: hl.tuple([tmp.alleles[j], j])))))
    # flatten per-dataset column metadata into a single combined list
    tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols)))
    return tmp.drop('data', 'g')
def allele_type(ref, alt):
    """Return the integer allele-type code for a ref/alt pair.

    SNPs are refined into the more specific ``Transition`` /
    ``Transversion`` codes; every other type is passed through unchanged.
    """
    def refine(base_type):
        # Split SNPs by transition vs. transversion; leave other types as-is.
        snp_subtype = hl.cond(
            hl.is_transition(ref, alt),
            allele_ints['Transition'],
            allele_ints['Transversion'])
        return hl.cond(base_type == allele_ints['SNP'], snp_subtype, base_type)

    return hl.bind(refine, _num_allele_type(ref, alt))
def struct_from_min_rep(i):
    """Build the split-variant struct for alt allele `i` of the enclosing `ds`.

    Computes the minimal representation of (ref, alt_i) and asserts that the
    locus is unchanged by min-rep — i.e. the input was already left-aligned.
    Errors out otherwise.  Relies on `ds` from the enclosing scope.
    """
    return hl.bind(lambda mr:
                   (hl.case()
                    # locus must survive min_rep unchanged for a left-aligned variant
                    .when(ds.locus == mr.locus,
                          hl.struct(
                              locus=ds.locus,
                              alleles=[mr.alleles[0], mr.alleles[1]],
                              a_index=i,
                              was_split=True))
                    .or_error("Found non-left-aligned variant in sparse_split_multi")),
                   hl.min_rep(ds.locus, [ds.alleles[0], ds.alleles[i]]))
def histogram(data, range=None, bins=50, legend=None, title=None):
    """Create a histogram.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram.  (Note: shadows the builtin
        ``range``; kept for backward compatibility with existing callers.)
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If `data` is an expression with no source table, or contains no
        defined, finite values.
    """
    if isinstance(data, Expression):
        if data._indices.source is None:
            # BUG FIX: this previously *returned* a ValueError instance
            # instead of raising it, so callers got an exception object
            # where a figure was expected.
            raise ValueError('Invalid input')
        agg_f = data._aggregation_method()
        if range is not None:
            start = range[0]
            end = range[1]
        else:
            # Restrict to finite values so the aggregated min/max are usable
            # as histogram bounds.
            finite_data = hail.bind(lambda x: hail.case().when(hail.is_finite(x), x).or_missing(),
                                    data)
            start, end = agg_f((aggregators.min(finite_data),
                                aggregators.max(finite_data)))
            if start is None and end is None:
                raise ValueError("'data' contains no values that are defined and finite")
        data = agg_f(aggregators.hist(data, start, end, bins))

    p = figure(title=title, x_axis_label=legend, y_axis_label='Frequency',
               background_fill_color='#EEEEEE')
    p.quad(
        bottom=0, top=data.bin_freq,
        left=data.bin_edges[:-1], right=data.bin_edges[1:],
        legend=legend, line_color='black')
    # Values falling outside [start, end] are drawn as one extra bar on
    # each side, sized like a regular bin.
    if data.n_larger > 0:
        p.quad(
            bottom=0, top=data.n_larger,
            left=data.bin_edges[-1],
            right=(data.bin_edges[-1] + (data.bin_edges[1] - data.bin_edges[0])),
            line_color='black', fill_color='green', legend='Outliers Above')
    if data.n_smaller > 0:
        p.quad(
            bottom=0, top=data.n_smaller,
            left=data.bin_edges[0] - (data.bin_edges[1] - data.bin_edges[0]),
            right=data.bin_edges[0],
            line_color='black', fill_color='red', legend='Outliers Below')
    return p
def transform_entries(old_entry):
    """Convert local-allele (L-prefixed) entry fields to global equivalents.

    Recodes LGT/LPGT/LAD/LPL into GT/PGT/AD/PL for the split variant,
    downcoding every local allele other than the current alt (found via
    the enclosing-scope ``ds[new_id].a_index``) to ref/non-ref.
    """
    def with_local_a_index(local_a_index):
        # Recompute a biallelic PL by taking, for each of the 3 downcoded
        # genotypes, the minimum LPL over all local genotypes that downcode
        # to it.  Missing if LPL or the local allele index is missing.
        new_pl = hl.or_missing(
            hl.is_defined(old_entry.LPL),
            hl.or_missing(
                hl.is_defined(local_a_index),
                hl.range(0, 3).map(lambda i: hl.min(
                    hl.range(0, hl.triangle(hl.len(old_entry.LA)))
                    .filter(lambda j: hl.downcode(hl.unphased_diploid_gt_index_call(j), local_a_index) == hl.unphased_diploid_gt_index_call(i))
                    .map(lambda idx: old_entry.LPL[idx])))))
        fields = set(old_entry.keys())

        def with_pl(pl):
            new_exprs = {}
            dropped_fields = ['LA']
            if 'LGT' in fields:
                new_exprs['GT'] = hl.downcode(old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
                dropped_fields.append('LGT')
            if 'LPGT' in fields:
                new_exprs['PGT'] = hl.downcode(old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
                dropped_fields.append('LPGT')
            if 'LAD' in fields:
                new_exprs['AD'] = hl.or_missing(
                    hl.is_defined(old_entry.LAD),
                    [old_entry.LAD[0], hl.or_else(old_entry.LAD[local_a_index], 0)])  # second entry zeroed for lack of non-ref AD
                dropped_fields.append('LAD')
            if 'LPL' in fields:
                new_exprs['PL'] = pl
                if 'GQ' in fields:
                    # prefer GQ derived from the recomputed PL; fall back to the original
                    new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
                dropped_fields.append('LPL')
            # Monoallelic sites: rename L-fields in place without downcoding.
            return hl.cond(hl.len(ds.alleles) == 1,
                           old_entry.annotate(**{f[1:]: old_entry[f] for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields}).drop(*dropped_fields),
                           old_entry.annotate(**new_exprs).drop(*dropped_fields))

        if 'LPL' in fields:
            return hl.bind(with_pl, new_pl)
        else:
            return with_pl(None)

    # Position of the current alt allele inside the entry's local allele
    # list; missing when the alt is not carried locally.
    lai = hl.fold(lambda accum, elt:
                  hl.cond(old_entry.LA[elt] == ds[new_id].a_index,
                          elt, accum),
                  hl.null(hl.tint32),
                  hl.range(0, hl.len(old_entry.LA)))
    return hl.bind(with_local_a_index, lai)
def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
    """Compute the de novo posterior and confidence for an autosomal call.

    `*_pp` are normalized genotype posteriors (hom-ref, het, hom-var) for
    proband/father/mother; `kid_ad_ratio` is the proband alt-read fraction.
    Relies on enclosing-scope thresholds (`min_gq`, `min_p`, etc.), the
    `prior` site frequency, the `failure` sentinel, and entry structs
    `kid`/`dad`/`mom`.
    """
    # P(observed genotypes | true de novo): both parents hom-ref, kid het.
    p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
    # P(at least one het parental allele) from the site frequency prior.
    p_het_in_parent = 1 - (1 - prior) ** 4
    # P(observed genotypes | one parent truly het but miscalled hom-ref).
    p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
    p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

    def solve(p_de_novo):
        # Hard filters first; then confidence tiers split by indel (~is_snp)
        # vs. SNV.  NOTE(review): the LOW tiers use a hard-coded 0.05 rather
        # than `min_p` — confirm that is intentional.
        return (
            hl.case()
            .when(kid.GQ < min_gq, failure)
            .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio) |
                  ~(kid_ad_ratio >= min_child_ab), failure)
            .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
            .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab) |
                  (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
            .when(p_de_novo < min_p, failure)
            .when(~is_snp, hl.case()
                  .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                        hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                  .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                        hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                  .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                        hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                  .or_missing())
            .default(hl.case()
                     .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                           ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                           ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                           hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                     .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                           hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                     .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                           hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                     .or_missing()
                     )
        )
    return hl.bind(solve, p_de_novo)
def transform_one(mt: MatrixTable) -> MatrixTable:
    """Transform a gVCF into a form suitable for combining.

    Moves per-variant INFO rank-sum fields and END into the entries, adds a
    local-allele index `LA`, rewrites `alleles` as minrep'ed (ref, alt)
    struct pairs, and reduces row `info` to the additive fields used by the
    combiner (with SB aggregated over entries).
    """
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0, hl.len(mt.alleles) - 1),
        END=mt.info.END,
        PL=mt['PL'][0:],
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    # This collects all fields with median combiners into arrays so we can calculate medians
    # when needed
    mt = mt.annotate_rows(
        # now minrep'ed (ref, alt) allele pairs
        alleles=hl.bind(lambda ref: mt.alleles[1:].map(lambda alt:
                                                       # minrep <NON_REF>
                                                       hl.struct(ref=hl.cond(alt == "<NON_REF>",
                                                                             ref[0:1],
                                                                             ref),
                                                                 alt=alt)),
                        mt.alleles[0]),
        info=mt.info.annotate(
            SB=hl.agg.array_sum(mt.entry.SB)
        ).select(
            "DP",
            "MQ_DP",
            "QUALapprox",
            "RAW_MQ",
            "VarDP",
            "SB",
        ))
    mt = mt.drop('SB', 'qual')
    return mt
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

     - `locus` (``locus``) -- Variant locus.
     - `alleles` (``array<str>``) -- Variant alleles.
     - `id` (``str``) -- Proband sample ID.
     - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
       the computed dataset alternate allele frequency, the
       `pop_frequency_prior` parameter, and the global prior ``1 / 3e7``.
     - `proband` (``struct``) -- Proband column fields from `mt`.
     - `father` (``struct``) -- Father column fields from `mt`.
     - `mother` (``struct``) -- Mother column fields from `mt`.
     - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
     - `father_entry` (``struct``) -- Father entry fields from `mt`.
     - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
     - `is_female` (``bool``) -- ``True`` if proband is female.
     - `p_de_novo` (``float64``) -- Unfiltered posterior probability that the
       event is *de novo* rather than a missed heterozygous event in a parent.
     - `confidence` (``str``) Validation confidence. One of: ``'HIGH'``,
       ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is a heterozygous. The model makes the
    simplifying assumption that when this configuration ``x = (AA, AA, AB)``
    of calls occurs, exactly one of the following is true:

     - ``d``: a de novo mutation occurred in the proband and all calls are
       accurate.
     - ``m``: at least one parental allele is actually heterozygous and the
       proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)}

    Applying Bayes rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) +
        \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the literature:

    .. math::

        \mathrm{P}(d) = \frac{1 \text{mutation}}{30,000,000\, \text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x\,|\,d)` and :math:`\mathrm{P}(x\,|\,m)`
    are computed from the PL (genotype likelihood) fields using these
    factorizations:

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big(
        &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB) \Big)

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big(
        & \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\
        + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AB) \Big) \\
        \cdot \, &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB)

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)

    While this posterior probability is a good metric for grouping putative
    de novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for
    by the phred-scaled genotype likelihoods. To this end, a number of hard
    filters are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the
    below rules, the following variables are used:

     - ``DR`` refers to the ratio of the read depth in the proband to the
       combined read depth in the parents.
     - ``AB`` refers to the read allele balance of the proband (number of
       alternate reads divided by total reads).
     - ``AC`` refers to the count of alternate alleles across all individuals
       in the dataset at the site.
     - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
     - ``min_p`` refers to the ``min_p`` function parameter.

    HIGH-quality SNV:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality SNV:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AB > 0.2 && AC == 1

    LOW-quality SNV:

    .. code-block:: text

        p > min_p && AB > 0.2

    HIGH-quality indel:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality indel:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AB > 0.2 and AC == 1

    LOW-quality indel:

    .. code-block:: text

        p > min_p && AB > 0.2

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the ``min_gq`` parameter, if the proband allele balance is
    lower than the ``min_child_ab`` parameter, if the depth ratio between the
    proband and parents is smaller than the ``min_dp_ratio`` parameter, or if
    the allele balance in a parent is above the ``max_parent_ab`` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.

    Returns
    -------
    :class:`.Table`
    """
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
                         f"missing {missing_fields}")

    mt = mt.annotate_rows(__prior=pop_frequency_prior,
                          __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()),
                          __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT)))
    # subtract 1 from __alt_alleles to correct for the observed genotype
    mt = mt.annotate_rows(__site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles,
                                             mt.__prior, MIN_POP_PRIOR))
    mt = require_biallelic(mt, 'de_novo')

    # FIXME check that __site_freq is between 0 and 1 when possible in expr
    tm = trio_matrix(mt, pedigree, complete_trios=True)

    # Ploidy classes for the proband at this locus.
    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    # Candidate configuration: proband het, both parents hom-ref.
    het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref() & tm.mother_entry.GT.is_hom_ref()
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD) < min_child_ab

    # Missing-struct sentinel returned by every failed hard filter.
    failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    # Convert phred-scaled PLs to normalized linear-scale genotype posteriors.
    kid_linear_pl = 10 ** (-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10 ** (-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10 ** (-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        # Posterior for a diploid (autosomal-style) call; see docstring math.
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            return (
                hl.case()
                .when(kid.GQ < min_gq, failure)
                .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio) |
                      ~(kid_ad_ratio >= min_child_ab), failure)
                .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
                .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab) |
                      (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
                .when(p_de_novo < min_p, failure)
                .when(~is_snp, hl.case()
                      .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                            hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                      .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                            hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                      .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                            hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                      .or_missing())
                .default(hl.case()
                         .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                               ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                               ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                               hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                         .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                               hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                         .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                               hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                         .or_missing()
                         )
            )
        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        # Posterior for a hemizygous call against the single relevant parent.
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (parent_pp[1] + parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            return (
                hl.case()
                .when(kid.GQ < min_gq, failure)
                .when((kid.DP / (parent.DP) < min_dp_ratio) |
                      (kid_ad_ratio < min_child_ab), failure)
                .when((hl.sum(parent.AD) == 0), failure)
                .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure)
                .when(p_de_novo < min_p, failure)
                .when(~is_snp, hl.case()
                      .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                            hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                      .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                            hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                      .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.3),
                            hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                      .or_missing())
                .default(hl.case()
                         .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                               ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                               ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                               hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                         .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                               hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                         .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                               hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                         .or_missing()
                         )
            )
        return hl.bind(solve, p_de_novo)

    # Dispatch on locus class; anything not het/hom/hom (or failing the
    # proband AB pre-filter) is dropped immediately.
    de_novo_call = (
        hl.case()
        .when(~het_hom_hom | kid_ad_fail, failure)
        .when(autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio))
        .when(hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio))
        .when(hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio))
        .or_missing()
    )

    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq',
                           'proband',
                           'father',
                           'mother',
                           'proband_entry',
                           'father_entry',
                           'mother_entry',
                           'is_female',
                           **entries.__call)
            .rename({'__site_freq': 'prior'}))
def transform_entries(old_entry):
    """Convert local-allele (L-prefixed) entry fields to global equivalents.

    Variant of the combiner's entry transform: GT/PGT are only downcoded for
    non-ref calls, AD pools all non-target reads into the ref slot, and a
    hom-ref short-circuit keeps the original LGT/LPGT while using the
    recomputed AD/PL/GQ.  Relies on enclosing-scope ``ds`` and ``new_id``.
    """
    def with_local_a_index(local_a_index):
        fields = set(old_entry.keys())

        def with_pl(pl):
            new_exprs = {}
            dropped_fields = ['LA']
            if 'LGT' in fields:
                # ref calls pass through untouched; non-ref calls are downcoded
                new_exprs['GT'] = hl.rbind(
                    old_entry.LGT, lambda lgt: hl.if_else(
                        lgt.is_non_ref(),
                        hl.downcode(
                            lgt, hl.or_else(local_a_index, hl.len(old_entry.LA))
                        ), lgt))
                dropped_fields.append('LGT')
            if 'LPGT' in fields:
                new_exprs['PGT'] = hl.rbind(
                    old_entry.LPGT, lambda lpgt: hl.if_else(
                        lpgt.is_non_ref(),
                        hl.downcode(
                            lpgt, hl.or_else(local_a_index, hl.len(old_entry.LA))
                        ), lpgt))
                dropped_fields.append('LPGT')
            if 'LAD' in fields:
                non_ref_ad = hl.or_else(old_entry.LAD[local_a_index], 0)  # zeroed if not in LAD
                new_exprs['AD'] = hl.or_missing(
                    hl.is_defined(old_entry.LAD),
                    [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
                dropped_fields.append('LAD')
            if 'LPL' in fields:
                new_exprs['PL'] = pl
                if 'GQ' in fields:
                    # GQ from the recomputed PL, falling back to the original
                    new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
                dropped_fields.append('LPL')

            # Three cases: monoallelic site (rename L-fields only), hom-ref
            # call (keep original GT/PGT, use other recomputed fields), or
            # the fully downcoded default.
            return (hl.case().when(
                hl.len(ds.alleles) == 1,
                old_entry.annotate(
                    **{
                        f[1:]: old_entry[f]
                        for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields
                    }).drop(*dropped_fields)).when(
                        hl.or_else(old_entry.LGT.is_hom_ref(), False),
                        old_entry.annotate(
                            **{
                                f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                                for f, e in new_exprs.items()
                            }).drop(*dropped_fields)).default(
                                old_entry.annotate(**new_exprs).drop(
                                    *dropped_fields)))

        if 'LPL' in fields:
            # Biallelic PL: per downcoded genotype, minimum LPL over all local
            # genotypes that map onto it.
            new_pl = hl.or_missing(
                hl.is_defined(old_entry.LPL),
                hl.or_missing(
                    hl.is_defined(local_a_index),
                    hl.range(0, 3).map(lambda i: hl.min(
                        hl.range(0, hl.triangle(hl.len(old_entry.LA))).
                        filter(lambda j: hl.downcode(
                            hl.unphased_diploid_gt_index_call(j),
                            local_a_index) == hl.
                            unphased_diploid_gt_index_call(i)).map(
                                lambda idx: old_entry.LPL[idx])))))
            return hl.bind(with_pl, new_pl)
        else:
            return with_pl(None)

    # Position of the current alt allele in the entry's local allele list;
    # missing when this sample does not carry the alt locally.
    lai = hl.fold(
        lambda accum, elt: hl.if_else(
            old_entry.LA[elt] == ds[new_id].a_index, elt, accum),
        hl.missing(hl.tint32), hl.range(0, hl.len(old_entry.LA)))
    return hl.bind(with_local_a_index, lai)
def split_position_end(position):
    """Extract the end coordinate from a VEP ``start-end`` position string.

    When start == end VEP emits just ``start``, so the token after the last
    ``-`` is always the end.  Missing input stays missing, and a ``?``
    placeholder yields a missing int.
    """
    def parse_token(token):
        return hl.if_else(token == "?", hl.null(hl.tint), hl.int(token))

    end_token = position.split("-")[-1]
    return hl.or_missing(hl.is_defined(position), hl.bind(parse_token, end_token))
def import_exac_vcf(path):
    """Import and reshape the ExAC sites VCF into the browser's variant schema.

    Splits multiallelics, selects the per-allele INFO values for each split
    variant, normalizes "NA"/empty strings to missing, parses VEP CSQ strings
    into structured transcript consequences, and emits one row per variant
    with ``exome`` population/quality structs (``genome`` is always missing
    for ExAC).
    """
    ds = hl.import_vcf(path, force_bgz=True, skip_invalid_loci=True).rows()

    ds = hl.split_multi(ds)

    ds = ds.repartition(5000, shuffle=True)

    # Get value corresponding to the split variant
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.or_missing(hl.is_defined(ds.info[field]), ds.info[field][ds.a_index - 1])
                for field in PER_ALLELE_FIELDS
            }
        )
    )

    # For DP_HIST and GQ_HIST, the first value in the array contains the histogram for all individuals,
    # which is the same in each alt allele's variant.
    ds = ds.annotate(
        info=ds.info.annotate(
            DP_HIST=hl.struct(all=ds.info.DP_HIST[0], alt=ds.info.DP_HIST[ds.a_index]),
            GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0], alt=ds.info.GQ_HIST[ds.a_index]),
        )
    )

    ds = ds.cache()

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(
            info=ds.info.annotate(
                **{
                    field: hl.or_missing(
                        hl.is_defined(ds.info[field]),
                        hl.if_else(
                            (hl.str(ds.info[field]) == "") | (hl.str(ds.info[field]) == "NA"),
                            hl.null(ds.info[field].dtype),
                            ds.info[field],
                        ),
                    )
                    for field in SELECT_INFO_FIELDS[i : i + 10]
                }
            )
        )

    # Convert field types
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tint), hl.int(ds.info[field]))
                for field in CONVERT_TO_INT_FIELDS
            }
        )
    )
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tfloat), hl.float(ds.info[field]))
                for field in CONVERT_TO_FLOAT_FIELDS
            }
        )
    )

    # Format VEP annotations to mimic the output of hail.vep
    # (undo VEP's percent-encoding of reserved VCF characters)
    ds = ds.annotate(
        info=ds.info.annotate(
            CSQ=ds.info.CSQ.map(
                lambda s: s.replace("%3A", ":")
                .replace("%3B", ";")
                .replace("%3D", "=")
                .replace("%25", "%")
                .replace("%2C", ",")
            )
        )
    )
    ds = ds.annotate(
        vep=hl.struct(
            transcript_consequences=ds.info.CSQ.map(
                # each CSQ string is a pipe-delimited record matching VEP_FIELDS
                lambda csq_str: hl.bind(
                    lambda csq_values: hl.struct(
                        **{
                            field: hl.if_else(csq_values[index] == "", hl.null(hl.tstr),
                                              csq_values[index])
                            for index, field in enumerate(VEP_FIELDS)
                        }
                    ),
                    csq_str.split(r"\|"),
                )
            )
            .filter(lambda annotation: annotation.Feature.startswith("ENST"))
            .filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index)
            .map(
                lambda annotation: annotation.select(
                    amino_acids=annotation.Amino_acids,
                    biotype=annotation.BIOTYPE,
                    canonical=annotation.CANONICAL == "YES",
                    # cDNA_position may contain either "start-end" or, when start == end, "start"
                    cdna_start=split_position_start(annotation.cDNA_position),
                    cdna_end=split_position_end(annotation.cDNA_position),
                    codons=annotation.Codons,
                    consequence_terms=annotation.Consequence.split("&"),
                    distance=hl.int(annotation.DISTANCE),
                    domains=hl.or_missing(
                        hl.is_defined(annotation.DOMAINS),
                        annotation.DOMAINS.split("&").map(
                            lambda d: hl.struct(db=d.split(":")[0], name=d.split(":")[1])
                        ),
                    ),
                    exon=annotation.EXON,
                    gene_id=annotation.Gene,
                    gene_symbol=annotation.SYMBOL,
                    gene_symbol_source=annotation.SYMBOL_SOURCE,
                    hgnc_id=annotation.HGNC_ID,
                    hgvsc=annotation.HGVSc,
                    hgvsp=annotation.HGVSp,
                    lof=annotation.LoF,
                    lof_filter=annotation.LoF_filter,
                    lof_flags=annotation.LoF_flags,
                    lof_info=annotation.LoF_info,
                    # PolyPhen field contains "polyphen_prediction(polyphen_score)"
                    polyphen_prediction=hl.or_missing(
                        hl.is_defined(annotation.PolyPhen), annotation.PolyPhen.split(r"\(")[0]
                    ),
                    protein_id=annotation.ENSP,
                    # Protein_position may contain either "start-end" or, when start == end, "start"
                    protein_start=split_position_start(annotation.Protein_position),
                    protein_end=split_position_end(annotation.Protein_position),
                    # SIFT field contains "sift_prediction(sift_score)"
                    sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT), annotation.SIFT.split(r"\(")[0]),
                    transcript_id=annotation.Feature,
                )
            )
        )
    )

    ds = ds.annotate(
        vep=ds.vep.annotate(
            most_severe_consequence=hl.bind(
                lambda all_consequence_terms: hl.or_missing(
                    all_consequence_terms.size() != 0,
                    hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]
                ),
                ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
            )
        )
    )

    ds = ds.cache()

    QUALITY_METRIC_HISTOGRAM_BIN_EDGES = [i * 5 for i in range(21)]

    ds = ds.select(
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        exome=hl.struct(
            ac=ds.info.AC_Adj,
            an=ds.info.AN_Adj,
            homozygote_count=ds.info.AC_Hom,
            hemizygote_count=hl.or_else(ds.info.AC_Hemi, 0),
            # zero adjusted AC gets an extra synthetic "AC0" filter
            filters=hl.set(hl.if_else(ds.info.AC_Adj == 0, ds.filters.add("AC0"), ds.filters)),
            populations=[
                hl.struct(
                    id=pop_id,
                    ac=ds.info[f"AC_{pop_id}"],
                    an=ds.info[f"AN_{pop_id}"],
                    hemizygote_count=hl.or_else(ds.info[f"Hemi_{pop_id}"], 0),
                    homozygote_count=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            ],
            # age histograms arrive as "|"-delimited strings of 12 counts:
            # n_smaller, 10 bins, n_larger
            age_distribution=hl.struct(
                het=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HET, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
                hom=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HOM, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
            ),
            quality_metrics=hl.struct(
                genotype_depth=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                genotype_quality=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                site_quality_metrics=[
                    hl.struct(metric="BaseQRankSum", value=hl.float(ds.info.BaseQRankSum)),
                    hl.struct(metric="ClippingRankSum", value=hl.float(ds.info.ClippingRankSum)),
                    hl.struct(metric="DP", value=hl.float(ds.info.DP)),
                    hl.struct(metric="FS", value=hl.float(ds.info.FS)),
                    hl.struct(metric="InbreedingCoeff", value=hl.float(ds.info.InbreedingCoeff)),
                    hl.struct(metric="MQ", value=hl.float(ds.info.MQ)),
                    hl.struct(metric="MQRankSum", value=hl.float(ds.info.MQRankSum)),
                    hl.struct(metric="QD", value=hl.float(ds.info.QD)),
                    hl.struct(metric="ReadPosRankSum", value=hl.float(ds.info.ReadPosRankSum)),
                    hl.struct(metric="SiteQuality", value=hl.float(ds.qual)),
                    hl.struct(metric="VQSLOD", value=hl.float(ds.info.VQSLOD)),
                ],
            ),
        ),
        # sibling variants split from the same original multiallelic site
        colocated_variants=hl.rbind(
            variant_id(ds.locus, ds.alleles),
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).filter(
                lambda v_id: v_id != this_variant_id
            ),
        ),
        vep=ds.vep,
    )

    # ExAC has no genome data; keep the field for schema compatibility.
    ds = ds.annotate(genome=hl.null(ds.exome.dtype))

    return ds
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Builds and returns a matrix where columns correspond to trios and
    entries contain genotypes for the trio.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------
    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----
    This method builds a new matrix table with one column per trio. If
    `complete_trios` is ``True``, then only trios that satisfy
    :meth:`.Trio.is_complete` are included. In this new dataset, the column
    identifiers are the sample IDs of the trio probands. The column fields and
    entries of the matrix are changed in the following ways:

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a Boolean field, and a string field:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset with a single string-typed column key (sample ID).
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    complete_trios : bool
        If ``True``, only include trios with all three members present.

    Returns
    -------
    :class:`.MatrixTable`
    """
    mt = dataset
    require_col_key_str(mt, "trio_matrix")

    k = mt.col_key.dtype.fields[0]
    samples = mt[k].collect()

    # Restrict the pedigree to samples present in the dataset.
    pedigree = pedigree.filter_to(samples)
    trios = pedigree.complete_trios() if complete_trios else pedigree.trios
    n_trios = len(trios)

    # Map each sample ID to its column index (idiom: dict comprehension
    # replaces the previous manual accumulation loop).
    sample_idx = {s: i for i, s in enumerate(samples)}

    # Re-express each trio as column indices so it can become a Hail literal.
    trios = [hl.Struct(
        id=sample_idx[t.s],
        pat_id=None if t.pat_id is None else sample_idx[t.pat_id],
        mat_id=None if t.mat_id is None else sample_idx[t.mat_id],
        is_female=t.is_female,
        fam_id=t.fam_id) for t in trios]
    trios_type = hl.dtype('array<struct{id:int,pat_id:int,mat_id:int,is_female:bool,fam_id:str}>')

    trios_sym = Env.get_uid()
    entries_sym = Env.get_uid()
    cols_sym = Env.get_uid()

    mt = mt.annotate_globals(**{trios_sym: hl.literal(trios, trios_type)})
    # Localize entries so columns/entries can be regrouped per trio.
    mt = mt._localize_entries(entries_sym, cols_sym)
    mt = mt.annotate_globals(**{
        cols_sym: hl.map(lambda i: hl.bind(
            lambda t: hl.struct(id=mt[cols_sym][t.id][k],
                                proband=mt[cols_sym][t.id],
                                father=mt[cols_sym][t.pat_id],
                                mother=mt[cols_sym][t.mat_id],
                                is_female=t.is_female,
                                fam_id=t.fam_id),
            mt[trios_sym][i]),
            hl.range(0, n_trios))})
    mt = mt.annotate(**{
        entries_sym: hl.map(lambda i: hl.bind(
            lambda t: hl.struct(proband_entry=mt[entries_sym][t.id],
                                father_entry=mt[entries_sym][t.pat_id],
                                mother_entry=mt[entries_sym][t.mat_id]),
            mt[trios_sym][i]),
            hl.range(0, n_trios))})
    mt = mt.drop(trios_sym)

    return mt._unlocalize_entries(entries_sym, cols_sym, ['id'])
def get_expr_for_vep_sorted_transcript_consequences_array(
        vep_root, include_coding_annotations=True, omit_consequences=OMIT_CONSEQUENCE_TERMS):
    """Sort transcripts by 3 properties:

    1. coding > non-coding
    2. transcript consequence severity
    3. canonical > non-canonical

    so that the 1st array entry will be for the coding, most-severe, canonical
    transcript (assuming one exists).

    Also, for each transcript in the array, computes these additional fields:

    domains: converts Array[Struct] to string of comma-separated domain names
    hgvs: set to hgvsp if it exists, or else hgvsc. formats hgvsp for synonymous variants.
    major_consequence: set to most severe consequence for that transcript
        (VEP sometimes provides multiple consequences for a single transcript)
    major_consequence_rank: major_consequence rank based on VEP SO ontology
        (most severe = 1)
        (see http://www.ensembl.org/info/genome/variation/predicted_data.html)
    category: set to one of: "lof", "missense", "synonymous", "other" based on
        the value of major_consequence.

    Args:
        vep_root (StructExpression): root path of the VEP struct in the MT
        include_coding_annotations (bool): if True, fields relevant to
            protein-coding variants will be included
        omit_consequences (list): consequence terms filtered out of each
            transcript's consequence_terms before ranking

    Returns:
        ArrayExpression of per-transcript structs, most severe first, each
        annotated with transcript_rank (its 0-based position in the array).
    """
    # Fields copied through for every transcript.
    selected_annotations = [
        "biotype",
        "canonical",
        "cdna_start",
        "cdna_end",
        "codons",
        "gene_id",
        "gene_symbol",
        "hgvsc",
        "hgvsp",
        "transcript_id",
    ]

    if include_coding_annotations:
        selected_annotations.extend([
            "amino_acids",
            "lof",
            "lof_filter",
            "lof_flags",
            "lof_info",
            "polyphen_prediction",
            "protein_id",
            "protein_start",
            "sift_prediction",
        ])

    omit_consequence_terms = hl.set(
        omit_consequences) if omit_consequences else hl.empty_set(hl.tstr)

    result = hl.sorted(
        vep_root.transcript_consequences.map(lambda c: c.select(
            *selected_annotations,
            consequence_terms=c.consequence_terms.filter(
                lambda t: ~omit_consequence_terms.contains(t)),
            # One "db:name" string per protein domain.
            domains=c.domains.map(lambda domain: domain.db + ":" + domain.name),
            # Most severe (lowest-ranked) remaining consequence term, if any.
            major_consequence=hl.cond(
                c.consequence_terms.size() > 0,
                hl.sorted(c.consequence_terms,
                          key=lambda t: CONSEQUENCE_TERM_RANK_LOOKUP.get(t))[0],
                hl.null(hl.tstr),
            # Transcripts whose every consequence was omitted are dropped.
            ))).filter(lambda c: c.consequence_terms.size() > 0).map(
            lambda c: c.annotate(
                # Bucket by severity of the major consequence: anything at
                # least as severe as frameshift -> "lof", then missense,
                # then synonymous, else "other".
                category=(hl.case().when(
                    CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                    CONSEQUENCE_TERM_RANK_LOOKUP.get("frameshift_variant"),
                    "lof",
                ).when(
                    CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                    CONSEQUENCE_TERM_RANK_LOOKUP.get("missense_variant"),
                    "missense",
                ).when(
                    CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                    CONSEQUENCE_TERM_RANK_LOOKUP.get("synonymous_variant"),
                    "synonymous",
                ).default("other")),
                hgvs=get_expr_for_formatted_hgvs(c),
                major_consequence_rank=CONSEQUENCE_TERM_RANK_LOOKUP.get(
                    c.major_consequence),
            )),
        # Sort key in 1..8: coding before non-coding, then transcripts
        # carrying the variant's most severe consequence, then canonical.
        lambda c: (hl.bind(
            lambda is_coding, is_most_severe, is_canonical: (hl.cond(
                is_coding,
                hl.cond(is_most_severe, hl.cond(is_canonical, 1, 2),
                        hl.cond(is_canonical, 3, 4)),
                hl.cond(is_most_severe, hl.cond(is_canonical, 5, 6),
                        hl.cond(is_canonical, 7, 8)),
            )),
            hl.or_else(c.biotype, "") == "protein_coding",
            hl.set(c.consequence_terms).contains(vep_root.most_severe_consequence),
            hl.or_else(c.canonical, 0) == 1,
        )),
    )

    if not include_coding_annotations:
        # for non-coding variants, drop fields here that are hard to exclude in the above code
        result = result.map(lambda c: c.drop("domains", "hgvsp"))

    # Record each transcript's position in the sorted array.
    return hl.zip_with_index(result).map(lambda csq_with_index: csq_with_index[
        1].annotate(transcript_rank=csq_with_index[0]))
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are
    always computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `call_rate` (``float64``) -- Fraction of samples with a defined `GT`.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg
      equilibrium. See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants
    beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')
    exprs = {}
    struct_exprs = []

    def has_field_of_type(name, dtype):
        # True iff the entry schema has field `name` with exactly this type.
        return name in mt.entry and mt[name].dtype == dtype

    n_samples = mt.count_cols()

    if has_field_of_type('DP', hl.tint32):
        exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            f"'variant_qc': expect an entry field 'GT' of type 'call'")

    exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    struct_exprs.append(hl.agg.call_stats(mt.GT, mt.alleles))

    # the structure of this function makes it easy to add new nested computations
    def flatten_struct(*struct_exprs):
        # Merge the fields of every aggregated struct with the scalar
        # aggregations collected in `exprs` into one flat struct.
        flat = {}
        for struct in struct_exprs:
            for k, v in struct.items():
                flat[k] = v
        return hl.struct(
            **flat,
            **exprs,
        )

    mt = mt.annotate_rows(**{name: hl.bind(flatten_struct, *struct_exprs)})

    # HWE test on biallelic counts: hom-ref, het (AC[1] minus 2*hom-alt),
    # and hom-alt sample counts.
    hwe = hl.hardy_weinberg_test(
        mt[name].homozygote_count[0],
        mt[name].AC[1] - 2 * mt[name].homozygote_count[1],
        mt[name].homozygote_count[1])
    hwe = hwe.select(het_freq_hwe=hwe.het_freq_hwe, p_value_hwe=hwe.p_value)
    mt = mt.annotate_rows(
        **{
            name: mt[name].annotate(
                n_not_called=n_samples - mt[name].n_called,
                call_rate=mt[name].n_called / n_samples,
                n_het=mt[name].n_called - hl.sum(mt[name].homozygote_count),
                n_non_ref=mt[name].n_called - mt[name].homozygote_count[0],
                # HWE stats are only meaningful for biallelic variants;
                # missing otherwise.
                **hl.cond(hl.len(mt.alleles) == 2, hwe, hl.null(hwe.dtype)))
        })
    return mt
def histogram(data, range=None, bins=50, legend=None, title=None, log=False, interactive=False):
    """Create a histogram.

    Notes
    -----
    `data` can be a :class:`.Float64Expression`, or the result of the
    :func:`.agg.hist` or :func:`.agg.approx_cdf` aggregators.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram.
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram.
    log : bool
        Plot the log10 of the bin counts.
    interactive : bool
        Only valid for 'approx_cdf' results: additionally return a callback
        for interactive re-binning in a notebook.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If `data` is an expression with no source, or contains no defined,
        finite values.
    """
    if isinstance(data, Expression):
        if data._indices.source is not None:
            if interactive:
                raise ValueError("'interactive' flag can only be used on data from 'approx_cdf'.")
            agg_f = data._aggregation_method()
            if range is not None:
                start = range[0]
                end = range[1]
            else:
                # Derive the plot range from the defined, finite values only.
                finite_data = hail.bind(lambda x: hail.case().when(hail.is_finite(x), x).or_missing(), data)
                start, end = agg_f((aggregators.min(finite_data),
                                    aggregators.max(finite_data)))
                if start is None and end is None:
                    raise ValueError(f"'data' contains no values that are defined and finite")
            data = agg_f(aggregators.hist(data, start, end, bins))
        else:
            # BUG FIX: previously this was `return ValueError(...)`, handing
            # callers an exception object instead of raising it.
            raise ValueError('Invalid input')
    elif 'values' in data:
        # 'approx_cdf' result: bin it with numpy, weighting each value by its
        # rank delta so the histogram approximates the original density.
        cdf = data
        hist, edges = np.histogram(cdf.values, bins=bins, weights=np.diff(cdf.ranks), density=True)
        data = Struct(bin_freq=hist, bin_edges=edges, n_larger=0, n_smaller=0)

    if log:
        # Build a fresh Struct instead of mutating `data` in place, and guard
        # against log10(0) when there are no outliers above/below the range
        # (previously this raised ValueError for zero counts).
        data = Struct(
            bin_freq=[log10(x) for x in data.bin_freq],
            bin_edges=data.bin_edges,
            n_larger=log10(data.n_larger) if data.n_larger > 0 else 0,
            n_smaller=log10(data.n_smaller) if data.n_smaller > 0 else 0)
        y_axis_label = 'log10 Frequency'
    else:
        y_axis_label = 'Frequency'

    # Pad the x-range by 5% of the data span on each side.
    x_span = data.bin_edges[-1] - data.bin_edges[0]
    x_start = data.bin_edges[0] - .05 * x_span
    x_end = data.bin_edges[-1] + .05 * x_span
    p = figure(
        title=title,
        x_axis_label=legend,
        y_axis_label=y_axis_label,
        background_fill_color='#EEEEEE',
        x_range=(x_start, x_end))
    q = p.quad(
        bottom=0, top=data.bin_freq,
        left=data.bin_edges[:-1], right=data.bin_edges[1:],
        legend=legend, line_color='black')
    # Extra colored bars for values that fell outside the histogram range.
    if data.n_larger > 0:
        p.quad(
            bottom=0, top=data.n_larger,
            left=data.bin_edges[-1],
            right=(data.bin_edges[-1] + (data.bin_edges[1] - data.bin_edges[0])),
            line_color='black', fill_color='green', legend='Outliers Above')
    if data.n_smaller > 0:
        p.quad(
            bottom=0, top=data.n_smaller,
            left=data.bin_edges[0] - (data.bin_edges[1] - data.bin_edges[0]),
            right=data.bin_edges[0],
            line_color='black', fill_color='red', legend='Outliers Below')
    if interactive:
        def mk_interact(handle):
            def update(bins=bins, phase=0):
                # `phase` slides the bin edges by a fraction of one bin width,
                # letting the user inspect binning artifacts.
                if phase > 0 and phase < 1:
                    bins = bins + 1
                    delta = (cdf.values[-1] - cdf.values[0]) / bins
                    edges = np.linspace(cdf.values[0] - (1 - phase) * delta,
                                        cdf.values[-1] + phase * delta, bins)
                else:
                    edges = np.linspace(cdf.values[0], cdf.values[-1], bins)
                hist, edges = np.histogram(cdf.values, bins=edges,
                                           weights=np.diff(cdf.ranks), density=True)
                new_data = {'top': hist, 'left': edges[:-1], 'right': edges[1:],
                            'bottom': np.full(len(hist), 0)}
                q.data_source.data = new_data
                bokeh.io.push_notebook(handle)

            from ipywidgets import interact
            interact(update, bins=(0, 5 * bins), phase=(0, 1, .01))

        return p, mk_interact
    else:
        return p
def prepare_gnomad_v2_mnvs(mnvs_path, three_bp_mnvs_path):
    """Load gnomAD v2 MNVs and link each 2bp MNV to the 3bp MNVs it is a
    component of.

    For every 3bp MNV (constituent SNVs 1, 2, 3), the three possible 2bp
    sub-MNVs (1+2, 2+3, 1+3) are reconstructed as variant IDs and used to
    attach `related_mnvs` annotations to the matching rows of the 2bp MNV
    table. The two tables are then unioned into one.

    Parameters
    ----------
    mnvs_path : str
        Path of the 2bp MNV file (read via import_mnv_file).
    three_bp_mnvs_path : str
        Path of the 3bp MNV file (read via import_three_bp_mnv_file).

    Returns
    -------
    Hail Table of all MNVs with a `related_mnvs` array field.
    """
    mnvs = import_mnv_file(mnvs_path, quote="'")
    mnvs_3bp = import_three_bp_mnv_file(three_bp_mnvs_path, quote="'")

    # 2bp MNV formed by SNVs 1 and 2 of each 3bp MNV, keyed back to it.
    snp12_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv1, snv2: hl.delimit(
                [snv1.chrom, hl.str(snv1.pos), snv1.ref + snv2.ref, snv1.alt + snv2.alt,],
                "-",
            ),
            mnvs_3bp.constituent_snvs[0],
            mnvs_3bp.constituent_snvs[1],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[2].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    # 2bp MNV formed by SNVs 2 and 3.
    snp23_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv2, snv3: hl.delimit(
                [snv2.chrom, hl.str(snv2.pos), snv2.ref + snv3.ref, snv2.alt + snv3.alt,],
                "-",
            ),
            mnvs_3bp.constituent_snvs[1],
            mnvs_3bp.constituent_snvs[2],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[0].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    # 2bp MNV formed by SNVs 1 and 3: spans all three bases, so the middle
    # base keeps the reference allele (snv2.ref) in the alt string.
    snp13_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv1, snv2, snv3: hl.delimit(
                [snv1.chrom, hl.str(snv1.pos), snv1.ref + snv2.ref + snv3.ref,
                 snv1.alt + snv2.ref + snv3.alt,],
                "-",
            ),
            mnvs_3bp.constituent_snvs[0],
            mnvs_3bp.constituent_snvs[1],
            mnvs_3bp.constituent_snvs[2],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[1].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )

    # One row per component 2bp MNV, with all 3bp MNVs it belongs to.
    component_2bp_mnvs = snp12_components.union(snp13_components).union(snp23_components)
    component_2bp_mnvs = component_2bp_mnvs.group_by(component_2bp_mnvs.component_mnv).aggregate(
        related_mnvs=hl.agg.collect(component_2bp_mnvs.related_mnv)
    )

    mnvs = mnvs.annotate(related_mnvs=component_2bp_mnvs[mnvs.variant_id].related_mnvs)
    # MNVs with no matching 3bp parent get an empty array instead of missing.
    mnvs = mnvs.annotate(
        related_mnvs=hl.or_else(mnvs.related_mnvs,
                                hl.empty_array(mnvs.related_mnvs.dtype.element_type))
    )
    mnvs = mnvs.annotate(
        related_mnvs=mnvs.related_mnvs.map(
            lambda related_mnv: related_mnv.select(
                "combined_variant_id",
                "n_individuals",
                "other_constituent_snvs",
                # True when, for any gene, the amino-acid change of this MNV
                # differs from that of the related MNV.
                changes_amino_acids=hl.bind(
                    lambda mnv_consequences, related_mnv_consequences: mnv_consequences.key_set()
                    .union(related_mnv_consequences.key_set())
                    .any(lambda gene_id: mnv_consequences.get(gene_id) != related_mnv_consequences.get(gene_id)),
                    hl.dict(mnvs.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
                    hl.dict(related_mnv.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
                ),
            )
        )
    )

    # 3bp MNVs carry no related-MNV links themselves; union them in.
    mnvs_3bp = mnvs_3bp.annotate(related_mnvs=hl.empty_array(mnvs.related_mnvs.dtype.element_type))
    mnvs = mnvs.union(mnvs_3bp)

    return mnvs
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    Annotates each row of `mt` with a struct field `name` containing
    per-variant QC metrics derived from the entry fields: `dp_stats` and
    `gq_stats` summaries (when `DP`/`GQ` of type int32 are present), call
    statistics from `GT` (`AC`, `AF`, `AN`, `homozygote_count`), call counts
    (`n_called`, `n_not_called`, `call_rate`, `n_het`, `n_non_ref`), and
    Hardy-Weinberg equilibrium statistics (`het_freq_hwe`, `p_value_hwe`),
    which are missing for multiallelic variants.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`

    Raises
    ------
    ValueError
        If `mt` has no entry field `GT` of type call.
    """
    require_row_key_variant(mt, 'variant_qc')

    def entry_field_matches(field, dtype):
        # True iff the entry schema has `field` with exactly this type.
        return field in mt.entry and mt[field].dtype == dtype

    scalar_aggs = {}
    struct_aggs = []
    n_samples = mt.count_cols()

    if entry_field_matches('DP', hl.tint32):
        scalar_aggs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')
    if entry_field_matches('GQ', hl.tint32):
        scalar_aggs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')
    if not entry_field_matches('GT', hl.tcall):
        raise ValueError(f"'variant_qc': expect an entry field 'GT' of type 'call'")

    scalar_aggs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    struct_aggs.append(hl.agg.call_stats(mt.GT, mt.alleles))

    # Merge every aggregated struct's fields with the scalar aggregations
    # into one flat struct; new nested computations slot in easily here.
    def merge_into_flat_struct(*aggregated):
        merged = {k: v for struct in aggregated for k, v in struct.items()}
        return hl.struct(
            **merged,
            **scalar_aggs,
        )

    mt = mt.annotate_rows(**{name: hl.bind(merge_into_flat_struct, *struct_aggs)})

    # HWE test from biallelic genotype counts: hom-ref, het, hom-alt.
    n_hom_ref = mt[name].homozygote_count[0]
    n_hom_alt = mt[name].homozygote_count[1]
    n_het_calls = mt[name].AC[1] - 2 * mt[name].homozygote_count[1]
    hwe = hl.hardy_weinberg_test(n_hom_ref, n_het_calls, n_hom_alt)
    hwe = hwe.select(het_freq_hwe=hwe.het_freq_hwe, p_value_hwe=hwe.p_value)

    derived_counts = dict(
        n_not_called=n_samples - mt[name].n_called,
        call_rate=mt[name].n_called / n_samples,
        n_het=mt[name].n_called - hl.sum(mt[name].homozygote_count),
        n_non_ref=mt[name].n_called - mt[name].homozygote_count[0])
    mt = mt.annotate_rows(**{
        name: mt[name].annotate(
            **derived_counts,
            # HWE fields are missing unless the variant is biallelic.
            **hl.cond(hl.len(mt.alleles) == 2, hwe, hl.null(hwe.dtype)))})
    return mt
def import_mnv_file(path, **kwargs):
    """Import a gnomAD v2 2bp MNV TSV file into a Hail Table.

    Reshapes the flat columns into nested structs: one `constituent_snvs`
    array describing the two component SNVs (with per-dataset exome/genome
    frequencies), top-level exome/genome MNV frequencies, and a sorted
    `consequences` array with one element per affected transcript.

    Parameters
    ----------
    path : str
        Path of the TSV file.
    **kwargs
        Extra keyword arguments forwarded to hl.import_table
        (e.g. ``quote``).

    Returns
    -------
    Hail Table keyed by (formerly) the `mnv` column, renamed `variant_id`.
    """
    # Explicit column types for the TSV import.
    column_types = {
        "AC_mnv_ex": hl.tint,
        "AC_mnv_gen": hl.tint,
        "AC_mnv": hl.tint,
        "AC_snp1_ex": hl.tint,
        "AC_snp1_gen": hl.tint,
        "AC_snp1": hl.tint,
        "AC_snp2_ex": hl.tint,
        "AC_snp2_gen": hl.tint,
        "AC_snp2": hl.tint,
        "AN_snp1_ex": hl.tfloat,
        "AN_snp1_gen": hl.tfloat,
        "AN_snp2_ex": hl.tfloat,
        "AN_snp2_gen": hl.tfloat,
        "categ": hl.tstr,
        "filter_snp1_ex": hl.tarray(hl.tstr),
        "filter_snp1_gen": hl.tarray(hl.tstr),
        "filter_snp2_ex": hl.tarray(hl.tstr),
        "filter_snp2_gen": hl.tarray(hl.tstr),
        "gene_id": hl.tstr,
        "gene_name": hl.tstr,
        "locus.contig": hl.tstr,
        "locus.position": hl.tint,
        "mnv_amino_acids": hl.tstr,
        "mnv_codons": hl.tstr,
        "mnv_consequence": hl.tstr,
        "mnv_lof": hl.tstr,
        "mnv": hl.tstr,
        "n_homhom_ex": hl.tint,
        "n_homhom_gen": hl.tint,
        "n_homhom": hl.tint,
        "n_indv_ex": hl.tint,
        "n_indv_gen": hl.tint,
        "n_indv": hl.tint,
        "snp1_amino_acids": hl.tstr,
        "snp1_codons": hl.tstr,
        "snp1_consequence": hl.tstr,
        "snp1_lof": hl.tstr,
        "snp1": hl.tstr,
        "snp2_amino_acids": hl.tstr,
        "snp2_codons": hl.tstr,
        "snp2_consequence": hl.tstr,
        "snp2_lof": hl.tstr,
        "snp2": hl.tstr,
        "transcript_id": hl.tstr,
    }

    ds = hl.import_table(path, key="mnv", missing="", types=column_types, **kwargs)

    ds = ds.rename({"mnv": "variant_id"})

    ds = ds.transmute(locus=hl.locus(ds["locus.contig"], ds["locus.position"]))
    ds = ds.transmute(chrom=normalized_contig(ds.locus.contig), pos=ds.locus.position, xpos=x_position(ds.locus),)

    # Variant IDs are "chrom-pos-ref-alt" strings.
    ds = ds.annotate(ref=ds.variant_id.split("-")[2], alt=ds.variant_id.split("-")[3])

    # Keep copies of snp1/snp2 IDs: the originals are consumed below but are
    # still needed afterwards for constituent_snv_ids and snv_consequences.
    ds = ds.annotate(snp1_copy=ds.snp1, snp2_copy=ds.snp2)
    ds = ds.transmute(
        constituent_snvs=[
            hl.bind(
                lambda variant_id_parts: hl.struct(
                    variant_id=ds[f"{snp}_copy"],
                    chrom=variant_id_parts[0],
                    pos=hl.int(variant_id_parts[1]),
                    ref=variant_id_parts[2],
                    alt=variant_id_parts[3],
                    # Per-dataset frequencies, present only where the SNV
                    # was observed in that dataset (AN defined).
                    exome=hl.or_missing(
                        hl.is_defined(ds[f"AN_{snp}_ex"]),
                        hl.struct(
                            filters=ds[f"filter_{snp}_ex"],
                            ac=ds[f"AC_{snp}_ex"],
                            an=hl.int(ds[f"AN_{snp}_ex"]),
                        ),
                    ),
                    genome=hl.or_missing(
                        hl.is_defined(ds[f"AN_{snp}_gen"]),
                        hl.struct(
                            filters=ds[f"filter_{snp}_gen"],
                            ac=ds[f"AC_{snp}_gen"],
                            an=hl.int(ds[f"AN_{snp}_gen"]),
                        ),
                    ),
                ),
                ds[f"{snp}_copy"].split("-"),
            )
            for snp in ["snp1", "snp2"]
        ]
    )

    ds = ds.annotate(constituent_snv_ids=[ds.snp1, ds.snp2])

    # The MNV counts as present in a dataset only if both SNVs are.
    ds = ds.annotate(
        mnv_in_exome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.exome)),
        mnv_in_genome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.genome)),
    )

    ds = ds.transmute(
        n_individuals=ds.n_indv,
        ac=ds.AC_mnv,
        ac_hom=ds.n_homhom,
        exome=hl.or_missing(
            ds.mnv_in_exome,
            hl.struct(n_individuals=ds.n_indv_ex, ac=ds.AC_mnv_ex, ac_hom=ds.n_homhom_ex),
        ),
        genome=hl.or_missing(
            ds.mnv_in_genome,
            hl.struct(n_individuals=ds.n_indv_gen, ac=ds.AC_mnv_gen, ac_hom=ds.n_homhom_gen),
        ),
    )

    ds = ds.drop("AC_snp1", "AC_snp2")

    ds = ds.transmute(
        consequence=hl.struct(
            category=ds.categ,
            gene_id=ds.gene_id,
            gene_name=ds.gene_name,
            transcript_id=ds.transcript_id,
            consequence=ds.mnv_consequence,
            codons=ds.mnv_codons,
            amino_acids=ds.mnv_amino_acids,
            lof=ds.mnv_lof,
            snv_consequences=[
                hl.struct(
                    variant_id=ds[f"{snp}"],
                    amino_acids=ds[f"{snp}_amino_acids"],
                    codons=ds[f"{snp}_codons"],
                    consequence=ds[f"{snp}_consequence"],
                    lof=ds[f"{snp}_lof"],
                )
                for snp in ["snp1", "snp2"]
            ],
        )
    )

    # Collapse table to one row per MNV, with all consequences for the MNV collected into an array
    consequences = ds.group_by(ds.variant_id).aggregate(consequences=hl.agg.collect(ds.consequence))
    ds = ds.drop("consequence")
    ds = ds.distinct()
    ds = ds.join(consequences)

    # Sort consequences by severity
    ds = ds.annotate(consequences=hl.sorted(ds.consequences, key=lambda c: consequence_term_rank(c.consequence),))

    # IDs of the constituent SNVs whose amino-acid change differs from the
    # MNV's, i.e. SNVs whose functional interpretation the MNV changes.
    ds = ds.annotate(
        changes_amino_acids_for_snvs=hl.literal([0, 1])
        .filter(
            lambda idx: ds.consequences.any(
                lambda csq: csq.snv_consequences[idx].amino_acids.lower() != csq.amino_acids.lower()
            )
        )
        .map(lambda idx: ds.constituent_snv_ids[idx])
    )

    return ds
def with_local_a_index(local_a_index): fields = set(old_entry.keys()) def with_pl(pl): new_exprs = {} dropped_fields = ['LA'] if 'LGT' in fields: new_exprs['GT'] = hl.downcode( old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA))) dropped_fields.append('LGT') if 'LPGT' in fields: new_exprs['PGT'] = hl.downcode( old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA))) dropped_fields.append('LPGT') if 'LAD' in fields: new_exprs['AD'] = hl.or_missing( hl.is_defined(old_entry.LAD), [ old_entry.LAD[0], hl.or_else(old_entry.LAD[local_a_index], 0) ]) # second entry zeroed for lack of non-ref AD dropped_fields.append('LAD') if 'LPL' in fields: new_exprs['PL'] = pl if 'GQ' in fields: new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ) dropped_fields.append('LPL') return (hl.case().when( hl.len(ds.alleles) == 1, old_entry.annotate( **{ f[1:]: old_entry[f] for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields }).drop(*dropped_fields)).when( hl.or_else(old_entry.LGT.is_hom_ref(), False), old_entry.annotate( **{ f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e for f, e in new_exprs.items() }).drop(*dropped_fields)).default( old_entry.annotate(**new_exprs).drop( *dropped_fields))) if 'LPL' in fields: new_pl = hl.or_missing( hl.is_defined(old_entry.LPL), hl.or_missing( hl.is_defined(local_a_index), hl.range(0, 3).map(lambda i: hl.min( hl.range(0, hl.triangle(hl.len(old_entry.LA))). filter(lambda j: hl.downcode( hl.unphased_diploid_gt_index_call(j), local_a_index) == hl. unphased_diploid_gt_index_call(i)).map( lambda idx: old_entry.LPL[idx]))))) return hl.bind(with_pl, new_pl) else: return with_pl(None)
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

    - `locus` (``locus``) -- Variant locus.
    - `alleles` (``array<str>``) -- Variant alleles.
    - `id` (``str``) -- Proband sample ID.
    - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
      the computed dataset alternate allele frequency, the
      `pop_frequency_prior` parameter, and the global prior ``1 / 3e7``.
    - `proband` (``struct``) -- Proband column fields from `mt`.
    - `father` (``struct``) -- Father column fields from `mt`.
    - `mother` (``struct``) -- Mother column fields from `mt`.
    - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
    - `father_entry` (``struct``) -- Father entry fields from `mt`.
    - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
    - `is_female` (``bool``) -- ``True`` if proband is female.
    - `p_de_novo` (``float64``) -- Unfiltered posterior probability that the
      event is *de novo* rather than a missed heterozygous event in a parent.
    - `confidence` (``str``) Validation confidence. One of: ``'HIGH'``,
      ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the simplifying
    assumption that when this configuration ``x = (AA, AA, AB)`` of calls
    occurs, exactly one of the following is true:

    - ``d``: a de novo mutation occurred in the proband and all calls are
      accurate.
    - ``m``: at least one parental allele is actually heterozygous and the
      proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d \mid x)}{\mathrm{P}(d \mid x) + \mathrm{P}(m \mid x)}

    Applying Bayes rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x \mid d)\,\mathrm{P}(d)}{\mathrm{P}(x \mid d)\,\mathrm{P}(d) +
        \mathrm{P}(x \mid m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the literature:

    .. math::

        \mathrm{P}(d) = \frac{1 \, \text{mutation}}{30{,}000{,}000 \, \text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x \mid d)` and
    :math:`\mathrm{P}(x \mid m)` are computed from the PL (genotype
    likelihood) fields using these factorizations:

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \mid d) = \left(
        \begin{aligned}
        &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AA) \\
        {} \cdot {} &\mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AA) \\
        {} \cdot {} &\mathrm{P}(x_{\mathrm{proband}} = AB \mid \mathrm{proband} = AB)
        \end{aligned}
        \right)

    .. math::

        \begin{aligned}
        \mathrm{P}(x = (AA, AA, AB) \mid m) = &\left(
        \begin{aligned}
        &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AB)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AA) \\
        {} + {} &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AA)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AB)
        \end{aligned}
        \right) \\
        &{} \cdot \mathrm{P}(x_{\mathrm{proband}} = AB \mid \mathrm{proband} = AB)
        \end{aligned}

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)

    While this posterior probability is a good metric for grouping putative
    de novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for
    by the phred-scaled genotype likelihoods. To this end, a number of hard
    filters are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the
    below rules, the following variables are used:

    - ``DR`` refers to the ratio of the read depth in the proband to the
      combined read depth in the parents.
    - ``DP`` refers to the read depth (DP field) of the proband.
    - ``AB`` refers to the read allele balance of the proband (number of
      alternate reads divided by total reads).
    - ``AC`` refers to the count of alternate alleles across all individuals
      in the dataset at the site.
    - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
    - ``min_p`` refers to the `min_p` function parameter.

    HIGH-quality SNV:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)
            OR
        (p > 0.99) AND (AB > 0.3) AND (DR > 0.2)
            OR
        (p > 0.5) AND (AB > 0.3) AND (AC < 10) AND (DP > 10)

    MEDIUM-quality SNV:

    .. code-block:: text

        (p > 0.5) AND ((AB > 0.3) OR (AC == 1))

    LOW-quality SNV:

    .. code-block:: text

        (AB > 0.2)

    HIGH-quality indel:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)

    MEDIUM-quality indel:

    .. code-block:: text

        (p > 0.5) AND (AB > 0.3) AND (AC <= 5)

    LOW-quality indel:

    .. code-block:: text

        (AB > 0.2)

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the `min_gq` parameter, if the proband allele balance is
    lower than the `min_child_ab` parameter, if the depth ratio between the
    proband and parents is smaller than the `min_dp_ratio` parameter, if the
    allele balance in a parent is above the `max_parent_ab` parameter, or if
    the posterior probability `p` is smaller than the `min_p` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.

    Returns
    -------
    :class:`.Table`
    """
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
                         f"missing {missing_fields}")

    mt = mt.annotate_rows(__prior=pop_frequency_prior,
                          __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()),
                          __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT)))
    # subtract 1 from __alt_alleles to correct for the observed genotype
    mt = mt.annotate_rows(__site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles,
                                             mt.__prior, MIN_POP_PRIOR))
    mt = require_biallelic(mt, 'de_novo')

    # FIXME check that __site_freq is between 0 and 1 when possible in expr
    tm = trio_matrix(mt, pedigree, complete_trios=True)

    # Ploidy classification per trio/locus. NOTE(review): mitochondrial loci
    # are only considered for female probands here — confirm intended.
    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    # Candidate configuration: proband het, both parents hom-ref.
    het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref() & tm.mother_entry.GT.is_hom_ref()
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD) < min_child_ab

    # Missing struct used to drop a candidate from consideration.
    failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    # Convert phred-scaled PLs to normalized genotype posteriors
    # (index 0 = hom-ref, 1 = het, 2 = hom-alt).
    kid_linear_pl = 10 ** (-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10 ** (-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10 ** (-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        # Posterior for diploid loci (see factorizations in the docstring).
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            # Hard filters, then confidence assignment (indels first branch,
            # SNVs in the default branch).
            return (hl.case()
                    .when(kid.GQ < min_gq, failure)
                    .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio) |
                          ~(kid_ad_ratio >= min_child_ab), failure)
                    .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
                    .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab) |
                          (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
                    .when(p_de_novo < min_p, failure)
                    .when(~is_snp,
                          hl.case()
                          .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                                hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                          .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                                hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                          .when(kid_ad_ratio > 0.2,
                                hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                          .or_missing())
                    .default(hl.case()
                             .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                                   ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                                   ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                                   hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                             .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                                   hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                             .when(kid_ad_ratio > 0.2,
                                   hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                             .or_missing()))

        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        # Posterior for hemizygous loci: only the single transmitting parent
        # contributes. NOTE(review): p_data_given_dn uses kid_pp[1] (het)
        # while p_data_given_missed_het uses kid_pp[2] (hom-alt) — verify
        # which proband genotype index is intended for hemizygous calls.
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (parent_pp[1] + parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            # Same filter structure as call_auto, against the single parent.
            # NOTE(review): the LOW indel threshold here is AB > 0.3, vs 0.2
            # in the autosomal path.
            return (hl.case()
                    .when(kid.GQ < min_gq, failure)
                    .when((kid.DP / (parent.DP) < min_dp_ratio) |
                          (kid_ad_ratio < min_child_ab), failure)
                    .when((hl.sum(parent.AD) == 0), failure)
                    .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure)
                    .when(p_de_novo < min_p, failure)
                    .when(~is_snp,
                          hl.case()
                          .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                                hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                          .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                                hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                          .when(kid_ad_ratio > 0.3,
                                hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                          .or_missing())
                    .default(hl.case()
                             .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                                   ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                                   ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                                   hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                             .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                                   hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                             .when(kid_ad_ratio > 0.2,
                                   hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                             .or_missing()))

        return hl.bind(solve, p_de_novo)

    # Dispatch by locus ploidy; non-candidate configurations are dropped.
    de_novo_call = (hl.case()
                    .when(~het_hom_hom | kid_ad_fail, failure)
                    .when(autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio))
                    .when(hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio))
                    .when(hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio))
                    .or_missing())

    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq', 'proband', 'father', 'mother',
                           'proband_entry', 'father_entry', 'mother_entry',
                           'is_female', **entries.__call)
            .rename({'__site_freq': 'prior'}))
# These fields contain float values but are stored as strings CONVERT_TO_FLOAT_FIELDS = [ "ESP_AF_POPMAX", "ESP_AF_GLOBAL", "KG_AF_POPMAX", "KG_AF_GLOBAL" ] # Convert "NA" and empty strings into null values # Convert fields in chunks to avoid "Method code too large" errors for i in range(0, len(SELECT_INFO_FIELDS), 10): mt = mt.annotate_rows(info=mt.info.annotate( **{ field: hl.or_missing( hl.is_defined(mt.info[field]), hl.bind( lambda value: hl.cond( (value == "") | (value == "NA"), hl.null(mt.info[field].dtype), mt.info[field]), hl.str(mt.info[field]), ), ) for field in SELECT_INFO_FIELDS[i:i + 10] })) # Convert field types mt = mt.annotate_rows(info=mt.info.annotate( **{ field: hl.cond(mt.info[field] == "", hl.null(hl.tint), hl.int(mt.info[field])) for field in CONVERT_TO_INT_FIELDS })) mt = mt.annotate_rows(info=mt.info.annotate( **{
def transform_entries(old_entry):
    # Rewrite an entry carrying "local"-allele fields (LGT/LPGT/LAD/LPL, indexed
    # into the local allele list LA) into global-style fields (GT/PGT/AD/PL) for
    # a single split-out alternate allele. Relies on `ds` and `new_id` from the
    # enclosing scope (presumably the split dataset and its row key id field —
    # confirm at the call site in the enclosing function).
    def with_local_a_index(local_a_index):
        # Downcoded biallelic PL of length 3: for each biallelic genotype i,
        # take the minimum LPL over all multi-allelic genotypes j that downcode
        # (w.r.t. local_a_index) to genotype i. Missing if LPL or the local
        # allele index is missing.
        new_pl = hl.or_missing(
            hl.is_defined(old_entry.LPL),
            hl.or_missing(
                hl.is_defined(local_a_index),
                hl.range(0, 3).map(lambda i: hl.min(
                    hl.range(0, hl.triangle(hl.len(old_entry.LA)))
                    .filter(lambda j: hl.downcode(
                        hl.unphased_diploid_gt_index_call(j),
                        local_a_index
                    ) == hl.unphased_diploid_gt_index_call(i)).map(
                        lambda idx: old_entry.LPL[idx])))))
        fields = set(old_entry.keys())

        def with_pl(pl):
            # Build the replacement fields, tracking which L-fields to drop.
            new_exprs = {}
            dropped_fields = ['LA']
            if 'LGT' in fields:
                # Downcode the local genotype; when the allele isn't carried
                # locally (missing index), downcode against len(LA) so all alts
                # collapse to reference.
                new_exprs['GT'] = hl.downcode(
                    old_entry.LGT,
                    hl.or_else(local_a_index, hl.len(old_entry.LA)))
                dropped_fields.append('LGT')
            if 'LPGT' in fields:
                # Same treatment for the phased genotype.
                new_exprs['PGT'] = hl.downcode(
                    old_entry.LPGT,
                    hl.or_else(local_a_index, hl.len(old_entry.LA)))
                dropped_fields.append('LPGT')
            if 'LAD' in fields:
                new_exprs['AD'] = hl.or_missing(
                    hl.is_defined(old_entry.LAD),
                    [
                        old_entry.LAD[0],
                        hl.or_else(old_entry.LAD[local_a_index], 0)
                    ])  # second entry zeroed for lack of non-ref AD
                dropped_fields.append('LAD')
            if 'LPL' in fields:
                new_exprs['PL'] = pl
                if 'GQ' in fields:
                    # Recompute GQ from the downcoded PL; fall back to the
                    # original GQ when the new PL is missing.
                    new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
                dropped_fields.append('LPL')
            return hl.cond(
                # Reference-only site: no alternates to renumber, so just
                # rename each L-field to its global name (strip the 'L').
                hl.len(ds.alleles) == 1,
                old_entry.annotate(
                    **{
                        f[1:]: old_entry[f]
                        for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields
                    }).drop(*dropped_fields),
                old_entry.annotate(**new_exprs).drop(*dropped_fields))

        if 'LPL' in fields:
            return hl.bind(with_pl, new_pl)
        else:
            return with_pl(None)

    # Position in LA of the allele being split out (the one whose global index
    # equals ds[new_id].a_index); missing if this entry doesn't carry it.
    lai = hl.fold(
        lambda accum, elt: hl.cond(old_entry.LA[elt] == ds[new_id].a_index,
                                   elt, accum),
        hl.null(hl.tint32),
        hl.range(0, hl.len(old_entry.LA)))
    return hl.bind(with_local_a_index, lai)
def combine(ts):
    """Merge one round of grouped gVCF-shard rows into combined rows.

    Expects a table whose rows carry a ``data`` array (presumably one element
    per input shard, each with ``alleles``, ``rsid`` and ``__entries`` — confirm
    against the combiner pipeline) and whose globals carry ``g``, an array of
    structs with per-input ``__cols``. The compiled merge function is cached per
    (row dtype, globals dtype) in ``_merge_function_map``.
    """
    def merge_alleles(alleles):
        # `alleles` is an array of per-input allele lists ([ref, alt1, ...]).
        from hail.expr.functions import _num_allele_type, _allele_ints
        return hl.rbind(
            # Combined reference = the longest ref allele among the inputs.
            alleles.map(lambda a: hl.or_else(a[0], ''))
            .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
            lambda ref: hl.rbind(
                alleles.map(
                    lambda al: hl.rbind(
                        al[0],
                        lambda r: hl.array([ref]).extend(
                            al[1:].map(
                                lambda a: hl.rbind(
                                    _num_allele_type(r, a),
                                    lambda at: hl.cond(
                                        # For these allele classes, rewrite the
                                        # alt onto the combined ref by appending
                                        # the ref suffix this input lacked.
                                        (_allele_ints['SNP'] == at)
                                        | (_allele_ints['Insertion'] == at)
                                        | (_allele_ints['Deletion'] == at)
                                        | (_allele_ints['MNP'] == at)
                                        | (_allele_ints['Complex'] == at),
                                        a + ref[hl.len(r):],
                                        a)))))),
                lambda lal: hl.struct(
                    # globl: deduplicated union of rewritten alleles, ref first.
                    globl=hl.array([ref]).extend(
                        hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                    # local: each input's rewritten allele list, in input order.
                    local=lal)))

    def renumber_entry(entry, old_to_new) -> StructExpression:
        # global index of alternate (non-ref) alleles
        return entry.annotate(LA=entry.LA.map(lambda lak: old_to_new[lak]))

    # Compile the merge function once per exact (row, globals) type signature.
    if (ts.row.dtype, ts.globals.dtype) not in _merge_function_map:
        f = hl.experimental.define_function(
            lambda row, gbl:
            hl.rbind(
                merge_alleles(row.data.map(lambda d: d.alleles)),
                lambda alleles:
                hl.struct(
                    locus=row.locus,
                    alleles=alleles.globl,
                    # First defined rsid among the inputs wins.
                    rsid=hl.find(hl.is_defined, row.data.map(lambda d: d.rsid)),
                    __entries=hl.bind(
                        lambda combined_allele_index:
                        hl.range(0, hl.len(row.data)).flatmap(
                            lambda i:
                            hl.cond(hl.is_missing(row.data[i].__entries),
                                    # Input i has no data at this row: emit one
                                    # missing entry per column of that input.
                                    hl.range(0, hl.len(gbl.g[i].__cols))
                                    .map(lambda _: hl.null(row.data[i].__entries.dtype.element_type)),
                                    # Otherwise renumber input i's local allele
                                    # indices into the combined allele list.
                                    hl.bind(
                                        lambda old_to_new: row.data[i].__entries.map(
                                            lambda e: renumber_entry(e, old_to_new)),
                                        hl.range(0, hl.len(alleles.local[i])).map(
                                            lambda j: combined_allele_index[alleles.local[i][j]])))),
                        # allele string -> its index in the combined allele list
                        hl.dict(hl.range(0, hl.len(alleles.globl)).map(
                            lambda j: hl.tuple([alleles.globl[j], j])))))),
            ts.row.dtype, ts.globals.dtype)
        _merge_function_map[(ts.row.dtype, ts.globals.dtype)] = f
    merge_function = _merge_function_map[(ts.row.dtype, ts.globals.dtype)]
    ts = Table(TableMapRows(ts._tir, Apply(merge_function._name,
                                           TopLevelReference('row'),
                                           TopLevelReference('global'))))
    # Concatenate per-input column metadata into the combined __cols global.
    return ts.transmute_globals(__cols=hl.flatten(ts.g.map(lambda g: g.__cols)))
def compute_stratified_metrics_filter(
    ht: hl.Table,
    qc_metrics: Dict[str, hl.expr.NumericExpression],
    strata: Optional[Dict[str, hl.expr.Expression]] = None,
    lower_threshold: float = 4.0,
    upper_threshold: float = 4.0,
    metric_threshold: Optional[Dict[str, Tuple[float, float]]] = None,
    filter_name: str = "qc_metrics_filters",
) -> hl.Table:
    """
    Compute median, MAD, and upper and lower thresholds for each metric used in outlier filtering

    :param ht: HT containing relevant sample QC metric annotations
    :param qc_metrics: list of metrics (name and expr) for which to compute the critical values for filtering outliers
    :param strata: List of annotations used for stratification. These metrics should be discrete types!
    :param lower_threshold: Lower MAD threshold
    :param upper_threshold: Upper MAD threshold
    :param metric_threshold: Can be used to specify different (lower, upper) thresholds for one or more metrics
    :param filter_name: Name of resulting filters annotation
    :return: Table grouped by strata, with upper and lower threshold values computed for each sample QC metric
    """
    # Per-metric (lower, upper) MAD multipliers; explicit overrides win.
    thresholds: Dict[str, Tuple[float, float]] = {
        m: (lower_threshold, upper_threshold) for m in qc_metrics
    }
    if metric_threshold is not None:
        thresholds.update(metric_threshold)

    def _collect_failures(
        table: hl.Table, metrics: Iterable[str]
    ) -> hl.expr.SetExpression:
        # Set of metric names whose fail_<metric> flag is True.
        failed = [hl.or_missing(table[f"fail_{m}"], m) for m in metrics]
        return hl.set(hl.filter(lambda x: hl.is_defined(x), failed))

    def _stats_with_bounds(metric: str):
        # median/MAD struct annotated with this metric's outlier cutoffs.
        lo_mult, hi_mult = thresholds[metric]
        return hl.bind(
            lambda s: s.annotate(
                lower=s.median - lo_mult * s.mad,
                upper=s.median + hi_mult * s.mad,
            ),
            get_median_and_mad_expr(ht[metric]),
        )

    if strata is None:
        strata = {}

    ht = ht.select(**qc_metrics, **strata).key_by("s").persist()

    agg_expr = hl.struct(**{m: _stats_with_bounds(m) for m in qc_metrics})

    if strata:
        # Aggregate stats within each stratum, keyed by the strata tuple.
        grouped = hl.agg.group_by(hl.tuple([ht[s] for s in strata]), agg_expr)
        ht = ht.annotate_globals(
            qc_metrics_stats=ht.aggregate(grouped, _localize=False)
        )
        metrics_stats_expr = ht.qc_metrics_stats[
            hl.tuple([ht[s] for s in strata])
        ]
    else:
        ht = ht.annotate_globals(
            qc_metrics_stats=ht.aggregate(agg_expr, _localize=False)
        )
        metrics_stats_expr = ht.qc_metrics_stats

    # Flag samples at or beyond either cutoff for each metric.
    ht = ht.transmute(
        **{
            f"fail_{m}": (ht[m] <= metrics_stats_expr[m].lower)
            | (ht[m] >= metrics_stats_expr[m].upper)
            for m in qc_metrics
        }
    )
    return ht.annotate(**{filter_name: _collect_failures(ht, qc_metrics)})
def combine(ts):
    """Merge grouped gVCF-shard rows into combined rows.

    Expects a table whose rows carry a ``data`` array (presumably one element
    per input shard, each with ``alleles``, ``rsid`` and ``__entries`` — confirm
    against the combiner pipeline) and whose globals carry ``g``, an array of
    structs with per-input ``__cols``. The compiled merge function is cached per
    (row dtype, globals dtype) in ``_merge_function_map``.
    """
    def merge_alleles(alleles):
        # `alleles` is an array of per-input allele lists ([ref, alt1, ...]).
        from hail.expr.functions import _num_allele_type, _allele_ints
        return hl.rbind(
            # Combined reference = the longest ref allele among the inputs.
            alleles.map(lambda a: hl.or_else(a[0], ''))
            .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
            lambda ref: hl.rbind(
                alleles.map(
                    lambda al: hl.rbind(
                        al[0],
                        lambda r: hl.array([ref]).extend(
                            al[1:].map(
                                lambda a: hl.rbind(
                                    _num_allele_type(r, a),
                                    lambda at: hl.cond(
                                        # For these allele classes, rewrite the
                                        # alt onto the combined ref by appending
                                        # the ref suffix this input lacked.
                                        (_allele_ints['SNP'] == at)
                                        | (_allele_ints['Insertion'] == at)
                                        | (_allele_ints['Deletion'] == at)
                                        | (_allele_ints['MNP'] == at)
                                        | (_allele_ints['Complex'] == at),
                                        a + ref[hl.len(r):],
                                        a)))))),
                lambda lal: hl.struct(
                    # globl: deduplicated union of rewritten alleles, ref first.
                    globl=hl.array([ref]).extend(
                        hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                    # local: each input's rewritten allele list, in input order.
                    local=lal)))

    def renumber_entry(entry, old_to_new) -> StructExpression:
        # global index of alternate (non-ref) alleles
        return entry.annotate(LA=entry.LA.map(lambda lak: old_to_new[lak]))

    # Compile the merge function once per exact (row, globals) type signature.
    if (ts.row.dtype, ts.globals.dtype) not in _merge_function_map:
        f = hl.experimental.define_function(
            lambda row, gbl:
            hl.rbind(
                merge_alleles(row.data.map(lambda d: d.alleles)),
                lambda alleles:
                hl.struct(
                    locus=row.locus,
                    alleles=alleles.globl,
                    # First defined rsid among the inputs wins.
                    rsid=hl.find(hl.is_defined, row.data.map(lambda d: d.rsid)),
                    __entries=hl.bind(
                        lambda combined_allele_index:
                        hl.range(0, hl.len(row.data)).flatmap(
                            lambda i:
                            hl.cond(hl.is_missing(row.data[i].__entries),
                                    # Input i has no data at this row: emit one
                                    # missing entry per column of that input.
                                    hl.range(0, hl.len(gbl.g[i].__cols))
                                    .map(lambda _: hl.null(row.data[i].__entries.dtype.element_type)),
                                    # Otherwise renumber input i's local allele
                                    # indices into the combined allele list.
                                    hl.bind(
                                        lambda old_to_new: row.data[i].__entries.map(
                                            lambda e: renumber_entry(e, old_to_new)),
                                        hl.range(0, hl.len(alleles.local[i])).map(
                                            lambda j: combined_allele_index[alleles.local[i][j]])))),
                        # allele string -> its index in the combined allele list
                        hl.dict(hl.range(0, hl.len(alleles.globl)).map(
                            lambda j: hl.tuple([alleles.globl[j], j])))))),
            ts.row.dtype, ts.globals.dtype)
        _merge_function_map[(ts.row.dtype, ts.globals.dtype)] = f
    merge_function = _merge_function_map[(ts.row.dtype, ts.globals.dtype)]
    # Apply here carries the function's return type as its second argument.
    ts = Table(TableMapRows(ts._tir, Apply(merge_function._name,
                                           merge_function._ret_type,
                                           TopLevelReference('row'),
                                           TopLevelReference('global'))))
    # Concatenate per-input column metadata into the combined __cols global.
    return ts.transmute_globals(__cols=hl.flatten(ts.g.map(lambda g: g.__cols)))
def fs_from_sb(
    sb: Union[hl.expr.ArrayNumericExpression, hl.expr.ArrayExpression],
    normalize: bool = True,
    min_cell_count: int = 200,
    min_count: int = 4,
    min_p_value: float = 1e-320,
) -> hl.expr.Int64Expression:
    # NOTE(review): to_phred produces a float value, so Float64Expression may be
    # the accurate return annotation — confirm before changing.
    """
    Computes `FS` (Fisher strand balance) annotation from the `SB` (strand balance table) field.

    `FS` is the phred-scaled value of the double-sided Fisher exact test on strand balance.

    Using default values will have the same behavior as the GATK implementation, that is:
    - If sum(counts) > 2*`min_cell_count` (default to GATK value of 200), they are normalized
    - If sum(counts) <= `min_count` (default to GATK value of 4), returns missing
    - Any p-value < `min_p_value` (default to GATK value of 1e-320) is truncated to that value

    In addition to the default GATK behavior, setting `normalize` to `False` will perform a
    chi-squared test for large counts (> `min_cell_count`) instead of normalizing the cell values.

    .. note::

        This function can either take
        - an array of length 4 containing the table counts: [ref fwd, ref rev, alt fwd, alt rev]
        - an array containing 2 arrays of length 2, containing the counts: [[ref fwd, ref rev], [alt fwd, alt rev]]

    GATK code here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FisherStrand.java

    :param sb: Count of ref/alt reads on each strand
    :param normalize: Whether to normalize counts if sum(counts) > min_cell_count (normalize=True), or use a chi sq instead of FET (normalize=False)
    :param min_cell_count: Maximum count for performing a FET
    :param min_count: Minimum total count to output FS (otherwise null is output)
    :param min_p_value: Lower bound applied to the test p-value before phred-scaling
    :return: FS value
    """
    # Accept the nested [[ref fwd, ref rev], [alt fwd, alt rev]] form by
    # flattening it to a flat length-4 array.
    if not isinstance(sb, hl.expr.ArrayNumericExpression):
        sb = hl.bind(lambda x: hl.flatten(x), sb)

    sb_sum = hl.bind(lambda x: hl.sum(x), sb)

    # Normalize table if counts get too large
    if normalize:
        fs_expr = hl.bind(
            lambda sb, sb_sum: hl.cond(
                sb_sum <= 2 * min_cell_count,
                sb,
                # Scale each cell down so the table total is ~min_cell_count.
                sb.map(lambda x: hl.int(x / (sb_sum / min_cell_count))),
            ),
            sb,
            sb_sum,
        )

        # FET
        fs_expr = to_phred(
            hl.max(
                hl.fisher_exact_test(
                    fs_expr[0], fs_expr[1], fs_expr[2], fs_expr[3]
                ).p_value,
                min_p_value,
            )
        )
    else:
        # contingency_table_test switches to a chi-squared test for counts
        # above min_cell_count instead of normalizing the table.
        fs_expr = to_phred(
            hl.max(
                hl.contingency_table_test(
                    sb[0], sb[1], sb[2], sb[3], min_cell_count=min_cell_count
                ).p_value,
                min_p_value,
            )
        )

    # Return null if counts <= `min_count`
    return hl.or_missing(
        sb_sum > min_count,
        hl.max(0, fs_expr)  # Needed to avoid -0.0 values
    )
] # These fields contain float values but are stored as strings CONVERT_TO_FLOAT_FIELDS = ["ESP_AF_POPMAX", "ESP_AF_GLOBAL", "KG_AF_POPMAX", "KG_AF_GLOBAL"] # Convert "NA" and empty strings into null values # Convert fields in chunks to avoid "Method code too large" errors for i in range(0, len(SELECT_INFO_FIELDS), 10): mt = mt.annotate_rows( info=mt.info.annotate( **{ field: hl.or_missing( hl.is_defined(mt.info[field]), hl.bind( lambda value: hl.cond( (value == "") | (value == "NA"), hl.null(mt.info[field].dtype), mt.info[field] ), hl.str(mt.info[field]), ), ) for field in SELECT_INFO_FIELDS[i : i + 10] } ) ) # Convert field types mt = mt.annotate_rows( info=mt.info.annotate( **{ field: hl.cond(mt.info[field] == "", hl.null(hl.tint), hl.int(mt.info[field])) for field in CONVERT_TO_INT_FIELDS }
def transmission_disequilibrium_test(dataset, pedigree):
    r"""Performs the transmission disequilibrium test on trios.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------
    Compute TDT association statistics and show the first two results:

    .. testsetup::

        tdt_dataset = hl.import_vcf('data/tdt_tiny.vcf')

    .. doctest::

        >>> pedigree = hl.Pedigree.read('data/tdt_trios.fam')
        >>> tdt_table = hl.transmission_disequilibrium_test(tdt_dataset, pedigree)
        >>> tdt_table.show(2)
        +---------------+------------+-------+-------+-------------+-------------+
        | locus         | alleles    |     t |     u |        chi2 |     p_value |
        +---------------+------------+-------+-------+-------------+-------------+
        | locus<GRCh37> | array<str> | int32 | int32 |     float64 |     float64 |
        +---------------+------------+-------+-------+-------------+-------------+
        | 1:246714629   | ["C","A"]  |     0 |     4 | 4.00000e+00 | 4.55003e-02 |
        | 2:167262169   | ["T","C"]  |    NA |    NA |          NA |          NA |
        +---------------+------------+-------+-------+-------------+-------------+

    Export variants with p-values below 0.001:

    >>> tdt_table = tdt_table.filter(tdt_table.p_value < 0.001)
    >>> tdt_table.export("output/tdt_results.tsv")

    Notes
    -----
    The `transmission disequilibrium test
    <https://en.wikipedia.org/wiki/Transmission_disequilibrium_test#The_case_of_trios:_one_affected_child_per_family>`__
    compares the number of times the alternate allele is transmitted (t) versus
    not transmitted (u) from a heterozygous parent to an affected child. The
    null hypothesis holds that each case is equally likely. The TDT statistic
    is given by

    .. math::

        (t - u)^2 \over (t + u)

    and asymptotically follows a chi-squared distribution with one degree of
    freedom under the null hypothesis.

    :func:`transmission_disequilibrium_test` only considers complete trios (two
    parents and a proband with defined sex) and only returns results for the
    autosome, as defined by :meth:`~hail.genetics.Locus.in_autosome`, and
    chromosome X. Transmissions and non-transmissions are counted only for the
    configurations of genotypes and copy state in the table below, in order to
    filter out Mendel errors and configurations where transmission is
    guaranteed. The copy state of a locus with respect to a trio is defined as
    follows:

    - Auto -- in autosome or in PAR of X or female child
    - HemiX -- in non-PAR of X and male child

    Here PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__
    of X and Y defined by :class:`.ReferenceGenome`, which many variant callers
    map to chromosome X.

    +--------+--------+--------+------------+---+---+
    | Kid    | Dad    | Mom    | Copy State | t | u |
    +========+========+========+============+===+===+
    | HomRef | Het    | Het    | Auto       | 0 | 2 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | Het    | HomRef | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | Het    | Auto       | 1 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomRef | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomRef | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | Het    | HomVar | Het    | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | Het    | Het    | HomVar | Auto       | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | Het    | Auto       | 2 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | Het    | HomVar | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | Auto       | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomRef | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomRef | HomVar | Het    | HemiX      | 0 | 1 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomRef | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+
    | HomVar | HomVar | Het    | HemiX      | 1 | 0 |
    +--------+--------+--------+------------+---+---+

    :func:`transmission_disequilibrium_test` produces a table with the
    following columns:

    - `locus` (:class:`.tlocus`) -- Locus.
    - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Alleles.
    - `t` (:py:data:`.tint32`) -- Number of transmitted alternate alleles.
    - `u` (:py:data:`.tint32`) -- Number of untransmitted alternate alleles.
    - `chi2` (:py:data:`.tfloat64`) -- TDT statistic.
    - `p_value` (:py:data:`.tfloat64`) -- p-value.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    pedigree : :class:`~hail.genetics.Pedigree`
        Sample pedigree.

    Returns
    -------
    :class:`.Table`
        Table of TDT results.
    """
    dataset = require_biallelic(dataset, 'transmission_disequilibrium_test')
    # Keep only loci where the test is defined: autosomes, PAR of X, and
    # non-PAR X (copy state depends on child sex for the latter).
    dataset = dataset.annotate_rows(auto_or_x_par=dataset.locus.in_autosome() | dataset.locus.in_x_par())
    dataset = dataset.filter_rows(dataset.auto_or_x_par | dataset.locus.in_x_nonpar())

    hom_ref = 0
    het = 1
    hom_var = 2

    auto = 2
    hemi_x = 1

    # kid, dad, mom, copy, t, u
    config_counts = [(hom_ref, het, het, auto, 0, 2),
                     (hom_ref, hom_ref, het, auto, 0, 1),
                     (hom_ref, het, hom_ref, auto, 0, 1),
                     (het, het, het, auto, 1, 1),
                     (het, hom_ref, het, auto, 1, 0),
                     (het, het, hom_ref, auto, 1, 0),
                     (het, hom_var, het, auto, 0, 1),
                     (het, het, hom_var, auto, 0, 1),
                     (hom_var, het, het, auto, 2, 0),
                     (hom_var, het, hom_var, auto, 1, 0),
                     (hom_var, hom_var, het, auto, 1, 0),
                     (hom_ref, hom_ref, het, hemi_x, 0, 1),
                     (hom_ref, hom_var, het, hemi_x, 0, 1),
                     (hom_var, hom_ref, het, hemi_x, 1, 0),
                     (hom_var, hom_var, het, hemi_x, 1, 0)]

    count_map = hl.literal({(c[0], c[1], c[2], c[3]): [c[4], c[5]] for c in config_counts})

    tri = trio_matrix(dataset, pedigree, complete_trios=True)

    # This filter removes mendel error of het father in x_nonpar. It also
    # avoids building and looking up config in the common case that neither
    # parent is het.
    # FIX: hl.bind takes the function first, then the expression(s) to bind;
    # the arguments were previously reversed (inconsistent with every other
    # hl.bind call in this file) and would fail at expression-construction
    # time.
    parent_is_valid_het = hl.bind(
        lambda father_is_het: (father_is_het & tri.auto_or_x_par) |
                              (tri.mother_entry.GT.is_het() & ~father_is_het),
        tri.father_entry.GT.is_het())

    # Copy state: 2 (Auto) on autosome/PAR or female child, else 1 (HemiX).
    copy_state = hl.cond(tri.auto_or_x_par | tri.is_female, 2, 1)

    config = (tri.proband_entry.GT.n_alt_alleles(),
              tri.father_entry.GT.n_alt_alleles(),
              tri.mother_entry.GT.n_alt_alleles(),
              copy_state)

    # Sum the per-trio [t, u] contributions for each variant; configurations
    # absent from count_map contribute nothing.
    tri = tri.annotate_rows(counts=agg.array_sum(
        agg.filter(parent_is_valid_het, count_map.get(config))))

    tab = tri.rows().select('locus', 'alleles', 'counts')
    tab = tab.transmute(t=tab.counts[0], u=tab.counts[1])
    tab = tab.annotate(chi2=((tab.t - tab.u) ** 2) / (tab.t + tab.u))
    tab = tab.annotate(p_value=hl.pchisqtail(tab.chi2, 1.0))

    return tab.cache()