def mendel_errors(call, pedigree) -> Tuple[Table, Table, Table, Table]:
    r"""Find Mendel errors; count per variant, individual and nuclear family.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Find all violations of Mendelian inheritance in each (dad, mom, kid) trio in
    a pedigree and return four tables (all errors, errors by family, errors by
    individual, errors by variant):

    >>> ped = hl.Pedigree.read('data/trios.fam')
    >>> all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(dataset['GT'], ped)

    Export all mendel errors to a text file:

    >>> all_errors.export('output/all_mendel_errors.tsv')

    Annotate columns with the number of Mendel errors:

    >>> annotated_samples = dataset.annotate_cols(mendel=per_sample[dataset.s])

    Annotate rows with the number of Mendel errors:

    >>> annotated_variants = dataset.annotate_rows(mendel=per_variant[dataset.locus, dataset.alleles])

    Notes
    -----

    The example above returns four tables, which contain Mendelian violations
    grouped in various ways. These tables are modeled after the `PLINK mendel
    formats <https://www.cog-genomics.org/plink2/formats#mendel>`_, resembling
    the ``.mendel``, ``.fmendel``, ``.imendel``, and ``.lmendel`` formats,
    respectively.

    **First table:** all Mendel errors. This table contains one row per Mendel
    error, keyed by the variant and proband id.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - (column key of the source matrix table) (:py:data:`.tstr`) -- Proband ID, key field.
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `mendel_code` (:py:data:`.tint32`) -- Mendel error code, see below.

    **Second table:** errors per nuclear family. This table contains one row
    per nuclear family, keyed by the parents. Families without any Mendel
    error are included with zero counts.

        - `pat_id` (:py:data:`.tstr`) -- Paternal ID. (key field)
        - `mat_id` (:py:data:`.tstr`) -- Maternal ID. (key field)
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `children` (:py:data:`.tint32`) -- Number of children in this nuclear family.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this nuclear family.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors at SNPs in this
          nuclear family.

    **Third table:** errors per individual. This table contains one row per
    individual. Each error is counted toward the proband, father, and mother
    according to the `Implicated` in the table below.

        - (column key of the source matrix table) (:py:data:`.tstr`) -- Sample ID (key field).
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual at SNPs.

    **Fourth table:** errors per variant.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this variant.

    This method only considers complete trios (two parents and proband with
    defined sex). The code of each Mendel error is determined by the table
    below, extending the
    `Plink classification <https://www.cog-genomics.org/plink2/basic_stats#mendel>`__.

    In the table, the copy state of a locus with respect to a trio is defined
    as follows, where PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__ (PAR) of X and Y
    defined by the reference genome and the autosome is defined by
    :meth:`~hail.genetics.Locus.in_autosome`.

    - Auto -- in autosome or in PAR or female child
    - HemiX -- in non-PAR of X and male child
    - HemiY -- in non-PAR of Y and male child

    `Any` refers to the set \{ HomRef, Het, HomVar, NoCall \} and `~` denotes
    complement in this set.

    +------+---------+---------+--------+------------+---------------+
    | Code | Dad     | Mom     | Kid    | Copy State | Implicated    |
    +======+=========+=========+========+============+===============+
    |    1 | HomVar  | HomVar  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    2 | HomRef  | HomRef  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    3 | HomRef  | ~HomRef | HomVar | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    4 | ~HomRef | HomRef  | HomVar | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    5 | HomRef  | HomRef  | HomVar | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    6 | HomVar  | ~HomVar | HomRef | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    7 | ~HomVar | HomVar  | HomRef | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    8 | HomVar  | HomVar  | HomRef | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    9 | Any     | HomVar  | HomRef | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   10 | Any     | HomRef  | HomVar | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   11 | HomVar  | Any     | HomRef | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   12 | HomRef  | Any     | HomVar | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+

    See Also
    --------
    :func:`.mendel_error_code`

    Parameters
    ----------
    call : :class:`.CallExpression`
        Call expression whose source is a :class:`.MatrixTable`
        (for example ``dataset['GT']``).
    pedigree : :class:`.Pedigree`
        Pedigree describing the trios to check.

    Returns
    -------
    (:class:`.Table`, :class:`.Table`, :class:`.Table`, :class:`.Table`)
        All errors, errors per nuclear family, errors per individual and
        errors per variant, in that order.

    Raises
    ------
    ValueError
        If `call` is not an expression derived from a :class:`.MatrixTable`.
    """
    source = call._indices.source
    if not isinstance(source, MatrixTable):
        raise ValueError(
            "'mendel_errors': expected 'call' to be an expression of 'MatrixTable', found {}".format(
                "expression of '{}'".format(source.__class__) if source is not None else 'scalar expression'
            )
        )

    # Store the call under a private entry field so that it cannot collide with
    # any entry field name the caller's dataset already uses.
    source = source.select_entries(__GT=call)
    dataset = require_biallelic(source, 'mendel_errors')
    tm = trio_matrix(dataset, pedigree, complete_trios=True)
    tm = tm.select_entries(
        mendel_code=hl.mendel_error_code(
            tm.locus,
            tm.is_female,
            tm.father_entry['__GT'],
            tm.mother_entry['__GT'],
            tm.proband_entry['__GT'],
        )
    )
    ck_name = next(iter(source.col_key))
    # From here on, every remaining entry of `tm` is exactly one Mendel error.
    tm = tm.filter_entries(hl.is_defined(tm.mendel_code))
    tm = tm.rename({'id': ck_name})

    entries = tm.entries()

    # Table 1: one row per Mendel error.
    table1 = entries.select('fam_id', 'mendel_code')

    # Table 2: one row per nuclear family.
    # Counts are first computed per trio column and then summed per parent
    # pair. Grouping the columns (rather than the error entries) ensures that
    # `children` counts every complete trio of the family, and that families
    # without any error still get a defined `children` and zero error counts.
    per_trio = (
        tm.annotate_cols(
            errors=hl.agg.count(),
            snp_errors=hl.agg.count_where(hl.is_snp(tm.alleles[0], tm.alleles[1])),
        )
        .key_cols_by()
        .cols()
    )
    per_trio = per_trio.select(
        pat_id=per_trio.father[ck_name],
        mat_id=per_trio.mother[ck_name],
        fam_id=per_trio.fam_id,
        errors=per_trio.errors,
        snp_errors=per_trio.snp_errors,
    )
    table2 = per_trio.group_by('pat_id', 'mat_id').aggregate(
        fam_id=hl.agg.take(per_trio.fam_id, 1)[0],
        children=hl.int32(hl.agg.count()),
        errors=hl.agg.sum(per_trio.errors),
        snp_errors=hl.agg.sum(per_trio.snp_errors),
    )

    # Table 3: one row per individual.
    # in implicated, idx 0 is dad, idx 1 is mom, idx 2 is child
    implicated = hl.literal(
        [
            [0, 0, 0],  # dummy
            [1, 1, 1],
            [1, 1, 1],
            [1, 0, 1],
            [0, 1, 1],
            [0, 0, 1],
            [1, 0, 1],
            [0, 1, 1],
            [0, 0, 1],
            [0, 1, 1],
            [0, 1, 1],
            [1, 0, 1],
            [1, 0, 1],
        ],
        dtype=hl.tarray(hl.tarray(hl.tint64)),
    )

    table3 = (
        tm.annotate_cols(
            all_errors=hl.or_else(hl.agg.array_sum(implicated[tm.mendel_code]), [0, 0, 0]),
            snp_errors=hl.or_else(
                hl.agg.filter(
                    hl.is_snp(tm.alleles[0], tm.alleles[1]),
                    hl.agg.array_sum(implicated[tm.mendel_code]),
                ),
                [0, 0, 0],
            ),
        )
        .key_cols_by()
        .cols()
    )

    # Fan each trio out into one record per member; the position of a member
    # in this list matches its index in the `implicated` rows.
    table3 = table3.select(
        xs=[
            hl.struct(
                **{
                    ck_name: table3.father[ck_name],
                    'fam_id': table3.fam_id,
                    'errors': table3.all_errors[0],
                    'snp_errors': table3.snp_errors[0],
                }
            ),
            hl.struct(
                **{
                    ck_name: table3.mother[ck_name],
                    'fam_id': table3.fam_id,
                    'errors': table3.all_errors[1],
                    'snp_errors': table3.snp_errors[1],
                }
            ),
            hl.struct(
                **{
                    ck_name: table3.proband[ck_name],
                    'fam_id': table3.fam_id,
                    'errors': table3.all_errors[2],
                    'snp_errors': table3.snp_errors[2],
                }
            ),
        ]
    )
    table3 = table3.explode('xs')
    table3 = table3.select(**table3.xs)
    # A parent of several probands appears once per trio; sum those records.
    table3 = (
        table3.group_by(ck_name, 'fam_id')
        .aggregate(errors=hl.agg.sum(table3.errors), snp_errors=hl.agg.sum(table3.snp_errors))
        .key_by(ck_name)
    )

    # Table 4: one row per variant.
    table4 = tm.select_rows(errors=hl.agg.count_where(hl.is_defined(tm.mendel_code))).rows()

    return table1, table2, table3, table4