def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining"""
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0, hl.len(mt.alleles) - 1),
        END=mt.info.END,
        PL=mt['PL'][0:],
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    mt = mt.annotate_rows(
        info=mt.info.annotate(
            DP=hl.agg.sum(mt.entry.DP),
            SB=hl.agg.array_sum(mt.entry.SB),
        ).select(
            "DP", "MQ_DP", "QUALapprox", "RAW_MQ", "VarDP", "SB",
        ))
    mt = mt.drop('SB', 'qual')
    return mt
def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining"""
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0, hl.len(mt.alleles)),
        END=mt.info.END,
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    mt = mt.annotate_rows(
        info=mt.info.annotate(
            SB_TABLE=hl.array([
                hl.agg.sum(mt.entry.SB[0]),
                hl.agg.sum(mt.entry.SB[1]),
                hl.agg.sum(mt.entry.SB[2]),
                hl.agg.sum(mt.entry.SB[3]),
            ])
        ).select(
            "MQ_DP", "QUALapprox", "RAW_MQ", "VarDP", "SB_TABLE",
        ))
    mt = mt.transmute_entries(
        LGT=mt.GT,
        LAD=mt.AD[0:],  # requiredness issues :'(
        LPL=mt.PL[0:],
        LPGT=mt.PGT)
    mt = mt.drop('SB', 'qual', 'filters')
    return mt
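# Hedged usage sketch for transform_one: the GVCF path and import options
# below are illustrative assumptions, not part of this module.
def _transform_one_example():
    mt = hl.import_vcf('data/sample.g.vcf.bgz',  # hypothetical single-sample GVCF
                       force_bgz=True,
                       reference_genome='GRCh38')
    return transform_one(mt)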
def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Rename duplicate column keys.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    This method produces a new column field from the string column key by
    appending a unique suffix ``_N`` as necessary. For example, if the column
    key "NA12878" appears three times in the dataset, the first will produce
    "NA12878", the second will produce "NA12878_1", and the third will produce
    "NA12878_2". The name of this new field is parameterized by `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    return MatrixTable(dataset._jvds.renameDuplicates(name))
def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Rename duplicate column keys.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    This method produces a new column field from the string column key by
    appending a unique suffix ``_N`` as necessary. For example, if the column
    key "NA12878" appears three times in the dataset, the first will produce
    "NA12878", the second will produce "NA12878_1", and the third will produce
    "NA12878_2". The name of this new field is parameterized by `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    return MatrixTable._from_java(dataset._jmt.renameDuplicates(name))
def read_multiple_matrix_tables(self, paths: 'List[str]', intervals: 'List[hl.Interval]',
                                intervals_type):
    json_repr = {
        'paths': paths,
        'intervals': intervals_type._convert_to_json(intervals),
        'intervalPointType': intervals_type.element_type.point_type._parsable_string(),
    }
    results = self._jhc.backend().pyReadMultipleMatrixTables(json.dumps(json_repr))
    return [MatrixTable._from_java(jm) for jm in results]
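# Hedged usage sketch for read_multiple_matrix_tables: the paths and the
# interval are illustrative assumptions; `backend` stands in for the object
# this method is defined on.
def _read_multiple_example(backend):
    intervals_type = hl.tarray(hl.tinterval(hl.tlocus('GRCh38')))
    intervals = [hl.Interval(hl.Locus('chr1', 1, reference_genome='GRCh38'),
                             hl.Locus('chr1', 1_000_000, reference_genome='GRCh38'))]
    return backend.read_multiple_matrix_tables(
        ['gs://my-bucket/a.mt', 'gs://my-bucket/b.mt'],  # hypothetical paths
        intervals,
        intervals_type)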
def window_by_locus(mt: MatrixTable, bp_window_size: int) -> MatrixTable:
    """Collect arrays of row and entry values from preceding loci.

    .. include:: ../_templates/req_tlocus.rst

    .. include:: ../_templates/experimental.rst

    Examples
    --------

    >>> ds_result = hl.window_by_locus(ds, 3)

    Notes
    -----
    This method groups each row (variant) with the previous rows in a window
    of `bp_window_size` base pairs, putting the row values from the previous
    variants into `prev_rows` (row field of type ``array<struct>``) and entry
    values from those variants into `prev_entries` (entry field of type
    ``array<struct>``).

    The `bp_window_size` argument is inclusive; if `bp_window_size` is 2 and
    the loci are

    .. code-block:: text

        1:100
        1:100
        1:102
        1:102
        1:103
        2:100
        2:101

    then the size of `prev_rows` is 0, 1, 2, 3, 2, 0, and 1, respectively (and
    same for the size of prev_entries).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Input dataset.
    bp_window_size : :obj:`int`
        Base pairs to include in the backwards window (inclusive).

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_first_key_field_locus(mt, 'window_by_locus')
    return MatrixTable(
        hl.ir.MatrixToMatrixApply(mt._mir, {
            'name': 'WindowByLocus',
            'basePairs': bp_window_size
        }))
def trio_matrix(dataset, pedigree, complete_trios=False):
    """Builds and returns a matrix where columns correspond to trios and
    entries contain genotypes for the trio.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----
    This method builds a new matrix table with one column per trio. If
    `complete_trios` is ``True``, then only trios that satisfy
    :meth:`.Trio.is_complete` are included. In this new dataset, the column
    identifiers are the sample IDs of the trio probands. The column fields and
    entries of the matrix are changed in the following ways:

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a Boolean field, and a string field:

    - **proband.id** (:py:data:`.tstr`) - Proband sample ID, same as trio column key.
    - **proband.fields** (:class:`.tstruct`) - Column fields on the proband.
    - **father.id** (:py:data:`.tstr`) - Father sample ID.
    - **father.fields** (:class:`.tstruct`) - Column fields on the father.
    - **mother.id** (:py:data:`.tstr`) - Mother sample ID.
    - **mother.fields** (:class:`.tstruct`) - Column fields on the mother.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    pedigree : :class:`.Pedigree`

    Returns
    -------
    :class:`.MatrixTable`
    """
    return MatrixTable(dataset._jvds.trioMatrix(pedigree._jrep, complete_trios))
def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining"""
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0, hl.len(mt.alleles) - 1),
        END=mt.info.END,
        PL=mt['PL'][0:],
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    # This collects all fields with median combiners into arrays so we can
    # calculate medians when needed
    mt = mt.annotate_rows(
        # now minrep'ed (ref, alt) allele pairs
        alleles=hl.bind(
            lambda ref: mt.alleles[1:].map(
                # minrep <NON_REF>
                lambda alt: hl.struct(
                    ref=hl.cond(alt == "<NON_REF>", ref[0:1], ref),
                    alt=alt)),
            mt.alleles[0]),
        info=mt.info.annotate(
            SB=hl.agg.array_sum(mt.entry.SB)
        ).select(
            "DP", "MQ_DP", "QUALapprox", "RAW_MQ", "VarDP", "SB",
        ))
    mt = mt.drop('SB', 'qual')
    return mt
def window_by_locus(mt: MatrixTable, bp_window_size: int) -> MatrixTable:
    """Collect arrays of row and entry values from preceding loci.

    .. include:: ../_templates/req_tlocus.rst

    .. include:: ../_templates/experimental.rst

    Examples
    --------

    >>> ds_result = hl.window_by_locus(ds, 3)

    Notes
    -----
    This method groups each row (variant) with the previous rows in a window
    of `bp_window_size` base pairs, putting the row values from the previous
    variants into `prev_rows` (row field of type ``array<struct>``) and entry
    values from those variants into `prev_entries` (entry field of type
    ``array<struct>``).

    The `bp_window_size` argument is inclusive; if `bp_window_size` is 2 and
    the loci are

    .. code-block:: text

        1:100
        1:100
        1:102
        1:102
        1:103
        2:100
        2:101

    then the size of `prev_rows` is 0, 1, 2, 3, 2, 0, and 1, respectively (and
    same for the size of prev_entries).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Input dataset.
    bp_window_size : :obj:`int`
        Base pairs to include in the backwards window (inclusive).

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_first_key_field_locus(mt, 'window_by_locus')
    return MatrixTable._from_java(mt._jmt.windowVariants(bp_window_size))
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Builds and returns a matrix where columns correspond to trios and
    entries contain genotypes for the trio.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----
    This method builds a new matrix table with one column per trio. If
    `complete_trios` is ``True``, then only trios that satisfy
    :meth:`.Trio.is_complete` are included. In this new dataset, the column
    identifiers are the sample IDs of the trio probands. The column fields and
    entries of the matrix are changed in the following ways:

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a Boolean field, and a string field:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    pedigree : :class:`.Pedigree`

    Returns
    -------
    :class:`.MatrixTable`
    """
    return MatrixTable._from_java(dataset._jmt.trioMatrix(pedigree._jrep, complete_trios))
def summarize_variants(mt: MatrixTable, show=True):
    """Summarize the variants present in a dataset and print the results.

    Examples
    --------
    >>> hl.summarize_variants(dataset)  # doctest: +SKIP
    ==============================
    Number of variants: 346
    ==============================
    Alleles per variant
    -------------------
      2 alleles: 346 variants
    ==============================
    Variants per contig
    -------------------
      20: 346 variants
    ==============================
    Allele type distribution
    ------------------------
            SNP: 301 alleles
       Deletion: 27 alleles
      Insertion: 18 alleles
    ==============================

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table with a variant (locus / alleles) row key.
    show : :obj:`bool`
        If ``True``, print results instead of returning them.

    Notes
    -----
    The result returned if `show` is ``False`` is a :class:`.Struct` with
    four fields:

    - `n_variants` (:obj:`int`): Number of variants present in the matrix table.
    - `allele_types` (:obj:`Dict[str, int]`): Number of alternate alleles in
      each allele category.
    - `contigs` (:obj:`Dict[str, int]`): Number of variants on each contig.
    - `allele_counts` (:obj:`Dict[int, int]`): Number of variants broken down
      by number of alleles (biallelic is 2, for example).

    Returns
    -------
    :obj:`None` or :class:`.Struct`
        Returns ``None`` if `show` is ``True``, or returns results as a struct.
    """
    require_row_key_variant(mt, 'summarize_variants')
    alleles_per_variant = hl.range(1, hl.len(mt.alleles)).map(
        lambda i: hl.allele_type(mt.alleles[0], mt.alleles[i]))
    allele_types, contigs, allele_counts, n_variants = mt.aggregate_rows(
        (hl.agg.explode(lambda elt: hl.agg.counter(elt), alleles_per_variant),
         hl.agg.counter(mt.locus.contig),
         hl.agg.counter(hl.len(mt.alleles)),
         hl.agg.count()))
    rg = mt.locus.dtype.reference_genome
    contig_idx = {contig: i for i, contig in enumerate(rg.contigs)}
    if show:
        max_contig_len = max(len(contig) for contig in contigs)
        contig_formatter = f'%{max_contig_len}s'

        max_allele_count_len = max(len(str(x)) for x in allele_counts)
        allele_count_formatter = f'%{max_allele_count_len}s'

        max_allele_type_len = max(len(x) for x in allele_types)
        allele_type_formatter = f'%{max_allele_type_len}s'

        line_break = '=============================='

        print(line_break)
        print(f'Number of variants: {n_variants}')
        print(line_break)
        print('Alleles per variant')
        print('-------------------')
        for n_alleles, count in sorted(allele_counts.items(), key=lambda x: x[0]):
            print(f'  {allele_count_formatter % n_alleles} alleles: {count} variants')
        print(line_break)
        print('Variants per contig')
        print('-------------------')
        for contig, count in sorted(contigs.items(), key=lambda x: contig_idx[x[0]]):
            print(f'  {contig_formatter % contig}: {count} variants')
        print(line_break)
        print('Allele type distribution')
        print('------------------------')
        for allele_type, count in Counter(allele_types).most_common():
            print(f'  {allele_type_formatter % allele_type}: {count} alternate alleles')
        print(line_break)
    else:
        return hl.Struct(allele_types=allele_types,
                         contigs=contigs,
                         allele_counts=allele_counts,
                         n_variants=n_variants)
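# For reference, a hedged sketch of the per-allele classification driving the
# "Allele type distribution" section above; hl.allele_type classifies a
# (ref, alt) pair into categories such as 'SNP' and 'Deletion'.
def _allele_type_example():
    return [hl.eval(hl.allele_type('A', 'T')),     # 'SNP'
            hl.eval(hl.allele_type('AAC', 'A'))]   # 'Deletion'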
def variant_qc(dataset, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_biallelic.rst

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes 18 variant statistics from the genotype data,
    returning a new struct field `name` with the following metrics:

    +-------------------------+---------+--------------------------------------------------------+
    | Name                    | Type    | Description                                            |
    +=========================+=========+========================================================+
    | ``call_rate``           | float64 | Fraction of samples with called genotypes              |
    +-------------------------+---------+--------------------------------------------------------+
    | ``AF``                  | float64 | Calculated alternate allele frequency (q)              |
    +-------------------------+---------+--------------------------------------------------------+
    | ``AC``                  | int32   | Count of alternate alleles                             |
    +-------------------------+---------+--------------------------------------------------------+
    | ``r_heterozygosity``    | float64 | Proportion of heterozygotes                            |
    +-------------------------+---------+--------------------------------------------------------+
    | ``r_het_hom_var``       | float64 | Ratio of heterozygotes to homozygous alternates        |
    +-------------------------+---------+--------------------------------------------------------+
    | ``r_expected_het_freq`` | float64 | Expected r_heterozygosity based on HWE                 |
    +-------------------------+---------+--------------------------------------------------------+
    | ``p_hwe``               | float64 | p-value from Hardy Weinberg Equilibrium null model     |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_hom_ref``           | int32   | Number of homozygous reference samples                 |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_het``               | int32   | Number of heterozygous samples                         |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_hom_var``           | int32   | Number of homozygous alternate samples                 |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_called``            | int32   | Sum of ``n_hom_ref``, ``n_het``, and ``n_hom_var``     |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_not_called``        | int32   | Number of uncalled samples                             |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_non_ref``           | int32   | Sum of ``n_het`` and ``n_hom_var``                     |
    +-------------------------+---------+--------------------------------------------------------+
    | ``dp_mean``             | float64 | Depth mean across all samples                          |
    +-------------------------+---------+--------------------------------------------------------+
    | ``dp_stdev``            | float64 | Depth standard deviation across all samples            |
    +-------------------------+---------+--------------------------------------------------------+
    | ``gq_mean``             | float64 | The average genotype quality across all samples        |
    +-------------------------+---------+--------------------------------------------------------+
    | ``gq_stdev``            | float64 | Genotype quality standard deviation across all samples |
    +-------------------------+---------+--------------------------------------------------------+

    Missing values ``NA`` may result from division by zero. The empirical
    standard deviation is computed with zero degrees of freedom.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new row-indexed field `name`.
    """
    return MatrixTable(Env.hail().methods.VariantQC.apply(
        require_biallelic(dataset, 'variant_qc')._jvds, name))
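# Hedged worked example of the arithmetic behind ``AC``, ``AF``, and
# ``n_called`` in the table above, independent of Hail; the genotype counts
# are hypothetical.
def _af_arithmetic_example():
    n_hom_ref, n_het, n_hom_var = 70, 25, 5
    n_called = n_hom_ref + n_het + n_hom_var  # 100
    ac = n_het + 2 * n_hom_var                # 35 alternate allele copies
    af = ac / (2 * n_called)                  # 0.175
    return n_called, ac, af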
def sample_qc(dataset, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_mean > 20) &
    ...                                        (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix
    and stores the results as a new column-indexed field in the matrix,
    named based on the `name` parameter.

    +--------------------------+---------+------------------------------------------------------+
    | Name                     | Type    | Description                                          |
    +==========================+=========+======================================================+
    | ``call_rate``            | float64 | Fraction of calls non-missing                        |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_hom_ref``            | int64   | Number of homozygous reference calls                 |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_het``                | int64   | Number of heterozygous calls                         |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_hom_var``            | int64   | Number of homozygous alternate calls                 |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_called``             | int64   | Sum of ``n_hom_ref`` + ``n_het`` + ``n_hom_var``     |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_not_called``         | int64   | Number of missing calls                              |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_snp``                | int64   | Number of SNP alternate alleles                      |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_insertion``          | int64   | Number of insertion alternate alleles                |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_deletion``           | int64   | Number of deletion alternate alleles                 |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_singleton``          | int64   | Number of private alleles                            |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_transition``         | int64   | Number of transition (A-G, C-T) alternate alleles    |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_transversion``       | int64   | Number of transversion alternate alleles             |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_star``               | int64   | Number of star (upstream deletion) alleles           |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_non_ref``            | int64   | Sum of ``n_het`` and ``n_hom_var``                   |
    +--------------------------+---------+------------------------------------------------------+
    | ``r_ti_tv``              | float64 | Transition/Transversion ratio                        |
    +--------------------------+---------+------------------------------------------------------+
    | ``r_het_hom_var``        | float64 | Het/HomVar call ratio                                |
    +--------------------------+---------+------------------------------------------------------+
    | ``r_insertion_deletion`` | float64 | Insertion/Deletion allele ratio                      |
    +--------------------------+---------+------------------------------------------------------+
    | ``dp_mean``              | float64 | Depth mean across all calls                          |
    +--------------------------+---------+------------------------------------------------------+
    | ``dp_stdev``             | float64 | Depth standard deviation across all calls            |
    +--------------------------+---------+------------------------------------------------------+
    | ``gq_mean``              | float64 | The average genotype quality across all calls        |
    +--------------------------+---------+------------------------------------------------------+
    | ``gq_stdev``             | float64 | Genotype quality standard deviation across all calls |
    +--------------------------+---------+------------------------------------------------------+

    Missing values ``NA`` may result from division by zero. The empirical
    standard deviation is computed with zero degrees of freedom.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.
    """
    return MatrixTable(Env.hail().methods.SampleQC.apply(
        require_biallelic(dataset, 'sample_qc')._jvds, name))
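# Hedged worked example of the ratio metrics in the table above, independent
# of Hail; the per-sample counts are hypothetical.
def _sample_qc_ratios_example():
    n_transition, n_transversion = 30, 15
    n_het, n_hom_var = 25, 5
    r_ti_tv = n_transition / n_transversion  # 2.0
    r_het_hom_var = n_het / n_hom_var        # 5.0
    return r_ti_tv, r_het_hom_var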
def persist_matrix_table(self, mt, storage_level):
    return MatrixTable._from_java(self._to_java_ir(mt._mir).pyPersist(storage_level))
def from_matrix_table(
        mt: MatrixTable,
        entry_field: str,
        *,
        n_partitions: Optional[int] = None,
        block_size: Optional[int] = None
) -> 'DNDArray':
    if n_partitions is None:
        n_partitions = mt.n_partitions()
    if block_size is None:
        block_size = DNDArray.default_block_size
    if n_partitions == 0:
        assert mt.count_cols() == 0
        assert mt.count_rows() == 0
        t = range_table(0, 0)
        t = t.annotate(r=0, c=0, block=nd.array([]).reshape((0, 0)))
        t = t.select_globals(
            r_field='r',
            c_field='c',
            n_rows=0,
            n_cols=0,
            n_block_rows=0,
            n_block_cols=0,
            block_size=0)
        return DNDArray(t)

    assert 'r' not in mt.row
    assert 'c' not in mt.row
    assert 'block' not in mt.row

    n_rows, n_cols = mt.count()
    n_block_rows = (n_rows + block_size - 1) // block_size
    n_block_cols = (n_cols + block_size - 1) // block_size
    entries, cols, row_index, col_blocks = (Env.get_uid() for _ in range(4))
    mt = (mt
          .select_globals()
          .select_rows()
          .select_cols()
          .add_row_index(row_index)
          .localize_entries(entries, cols))
    # FIXME: remove when ndarray supports structs
    mt = mt.annotate(**{entries: mt[entries][entry_field]})
    mt = mt.annotate(
        **{col_blocks: hl.range(n_block_cols).map(
            lambda c: hl.struct(
                c=c,
                entries=mt[entries][(c * block_size):((c + 1) * block_size)]))}
    )
    mt = mt.explode(col_blocks)
    mt = mt.select(row_index, **mt[col_blocks])
    mt = mt.annotate(r=hl.int(mt[row_index] // block_size))
    mt = mt.key_by(mt.r, mt.c)
    mt = mt.group_by(mt.r, mt.c).aggregate(
        entries=hl.sorted(
            hl.agg.collect(hl.struct(row_index=mt[row_index], entries=mt.entries)),
            key=lambda x: x.row_index
        ).map(lambda x: x.entries))
    mt = mt.select(block=hl.nd.array(mt.entries))
    mt = mt.select_globals(
        r_field='r',
        c_field='c',
        n_rows=n_rows,
        n_cols=n_cols,
        n_block_rows=n_block_rows,
        n_block_cols=n_block_cols,
        block_size=block_size)
    fname = new_temp_file()
    mt = mt.key_by(mt.r, mt.c)
    mt.write(fname, _codec_spec=DNDArray.fast_codec_spec)
    t = hl.read_table(fname, _intervals=[
        hl.Interval(hl.Struct(r=i, c=j), hl.Struct(r=i, c=j + 1))
        for i in range(n_block_rows)
        for j in range(n_block_cols)])
    return DNDArray(t)
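# Worked example of the ceil-division block arithmetic used above: a 10 x 7
# matrix tiled with block_size=4 yields 3 block-rows and 2 block-columns.
def _block_count_example():
    n_rows, n_cols, block_size = 10, 7, 4
    n_block_rows = (n_rows + block_size - 1) // block_size  # == 3
    n_block_cols = (n_cols + block_size - 1) // block_size  # == 2
    return n_block_rows, n_block_cols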
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

    - `locus` (``locus``) -- Variant locus.
    - `alleles` (``array<str>``) -- Variant alleles.
    - `id` (``str``) -- Proband sample ID.
    - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
      the computed dataset alternate allele frequency, the
      `pop_frequency_prior` parameter, and the global prior ``1 / 3e7``.
    - `proband` (``struct``) -- Proband column fields from `mt`.
    - `father` (``struct``) -- Father column fields from `mt`.
    - `mother` (``struct``) -- Mother column fields from `mt`.
    - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
    - `father_entry` (``struct``) -- Father entry fields from `mt`.
    - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
    - `is_female` (``bool``) -- ``True`` if proband is female.
    - `p_de_novo` (``float64``) -- Unfiltered posterior probability that the
      event is *de novo* rather than a missed heterozygous event in a parent.
    - `confidence` (``str``) -- Validation confidence. One of: ``'HIGH'``,
      ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the simplifying
    assumption that when this configuration ``x = (AA, AA, AB)`` of calls
    occurs, exactly one of the following is true:

    - ``d``: a de novo mutation occurred in the proband and all calls are
      accurate.
    - ``m``: at least one parental allele is actually heterozygous and the
      proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)}

    Applying Bayes rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) +
        \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the
    literature:

    .. math::

        \mathrm{P}(d) = \frac{1\,\text{mutation}}{30{,}000{,}000\,\text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x\,|\,d)` and
    :math:`\mathrm{P}(x\,|\,m)` are computed from the PL (genotype likelihood)
    fields using these factorizations:

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big(
        &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB) \Big)

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big( &
        \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\
        + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AB) \Big) \\
        \cdot \, &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB)

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by
    far.)

    While this posterior probability is a good metric for grouping putative
    de novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for
    by the phred-scaled genotype likelihoods. To this end, a number of hard
    filters are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the
    below rules, the following variables are used:

    - ``DR`` refers to the ratio of the read depth in the proband to the
      combined read depth in the parents.
    - ``AB`` refers to the read allele balance of the proband (number of
      alternate reads divided by total reads).
    - ``AC`` refers to the count of alternate alleles across all individuals
      in the dataset at the site.
    - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
    - ``min_p`` refers to the ``min_p`` function parameter.

    HIGH-quality SNV:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality SNV:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AB > 0.2 && AC == 1

    LOW-quality SNV:

    .. code-block:: text

        p > min_p && AB > 0.2

    HIGH-quality indel:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality indel:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AB > 0.2 && AC == 1

    LOW-quality indel:

    .. code-block:: text

        p > min_p && AB > 0.2

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the ``min_gq`` parameter, if the proband allele balance is
    lower than the ``min_child_ab`` parameter, if the depth ratio between the
    proband and parents is smaller than the ``min_dp_ratio`` parameter, or if
    the allele balance in a parent is above the ``max_parent_ab`` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.

    Returns
    -------
    :class:`.Table`
    """
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
                         f"missing {missing_fields}")

    mt = mt.annotate_rows(__prior=pop_frequency_prior,
                          __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()),
                          __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT)))
    # subtract 1 from __alt_alleles to correct for the observed genotype
    mt = mt.annotate_rows(__site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles,
                                             mt.__prior,
                                             MIN_POP_PRIOR))
    mt = require_biallelic(mt, 'de_novo')

    # FIXME check that __site_freq is between 0 and 1 when possible in expr
    tm = trio_matrix(mt, pedigree, complete_trios=True)

    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref() & tm.mother_entry.GT.is_hom_ref()
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD) < min_child_ab

    failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    kid_linear_pl = 10 ** (-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10 ** (-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10 ** (-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            return (
                hl.case()
                .when(kid.GQ < min_gq, failure)
                .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio)
                      | ~(kid_ad_ratio >= min_child_ab), failure)
                .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
                .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab)
                      | (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
                .when(p_de_novo < min_p, failure)
                .when(~is_snp,
                      hl.case()
                      .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                            hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                      .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                            hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                      .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                            hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                      .or_missing())
                .default(hl.case()
                         .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2))
                               | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1))
                               | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                               hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                         .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                               hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                         .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                               hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                         .or_missing()))

        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (parent_pp[1] + parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            return (
                hl.case()
                .when(kid.GQ < min_gq, failure)
                .when((kid.DP / (parent.DP) < min_dp_ratio)
                      | (kid_ad_ratio < min_child_ab), failure)
                .when((hl.sum(parent.AD) == 0), failure)
                .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure)
                .when(p_de_novo < min_p, failure)
                .when(~is_snp,
                      hl.case()
                      .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                            hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                      .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                            hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                      .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.3),
                            hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                      .or_missing())
                .default(hl.case()
                         .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2))
                               | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1))
                               | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                               hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                         .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                               hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                         .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                               hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                         .or_missing()))

        return hl.bind(solve, p_de_novo)

    de_novo_call = (
        hl.case()
        .when(~het_hom_hom | kid_ad_fail, failure)
        .when(autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio))
        .when(hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio))
        .when(hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio))
        .or_missing())

    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq',
                           'proband',
                           'father',
                           'mother',
                           'proband_entry',
                           'father_entry',
                           'mother_entry',
                           'is_female',
                           **entries.__call)
            .rename({'__site_freq': 'prior'}))
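# Hedged, Hail-independent sketch of the posterior computed in `call_auto`
# above, for a single trio at a single site. The PL vectors and the site
# frequency below are hypothetical inputs chosen for illustration.
def _p_de_novo_sketch():
    de_novo_prior = 1 / 30000000

    def pl_to_pp(pl):
        # phred-scaled likelihoods -> normalized linear-scale probabilities
        linear = [10 ** (-x / 10) for x in pl]
        total = sum(linear)
        return [x / total for x in linear]

    dad_pp = pl_to_pp([0, 60, 600])   # father confidently hom-ref
    mom_pp = pl_to_pp([0, 60, 600])   # mother confidently hom-ref
    kid_pp = pl_to_pp([90, 0, 600])   # proband confidently het
    af = 1e-4                         # site frequency prior

    p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * de_novo_prior
    p_het_in_parent = 1 - (1 - af) ** 4
    p_data_given_missed_het = (dad_pp[1] * mom_pp[0]
                               + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
    return p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)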
def unpersist_matrix_table(self, mt):
    return MatrixTable._from_java(self._to_java_ir(mt._mir).pyUnpersist())
def from_matrix_table(
        mt: MatrixTable,
        entry_field: str,
        *,
        n_partitions: Optional[int] = None,
        block_size: Optional[int] = None,
        sort_columns: bool = False
) -> 'DNDArray':
    if n_partitions is None:
        n_partitions = mt.n_partitions()
    if block_size is None:
        block_size = DNDArray.default_block_size
    if n_partitions == 0:
        assert mt.count_cols() == 0
        assert mt.count_rows() == 0
        t = range_table(0, 0)
        t = t.annotate(r=0, c=0, block=nd.array([]).reshape((0, 0)))
        t = t.select_globals(
            n_rows=0,
            n_cols=0,
            n_block_rows=0,
            n_block_cols=0,
            block_size=0)
        return DNDArray(t)

    assert 'r' not in mt.row
    assert 'c' not in mt.row
    assert 'block' not in mt.row

    n_rows, n_cols = mt.count()
    n_block_rows = (n_rows + block_size - 1) // block_size
    n_block_cols = (n_cols + block_size - 1) // block_size
    entries, cols, row_index, col_blocks = (Env.get_uid() for _ in range(4))

    if sort_columns:
        col_index = Env.get_uid()
        col_order = mt.add_col_index(col_index)
        col_order = col_order.key_cols_by().cols()
        col_order = col_order.select(key=col_order.row.select(*mt.col_key),
                                     index=col_order[col_index])
        col_order = col_order.collect(_localize=False)
        col_order = hl.sorted(col_order, key=lambda x: x.key)
        col_order = col_order['index'].collect()[0]
        mt = mt.choose_cols(col_order)
    else:
        col_keys = mt.col_key.collect(_localize=False)
        out_of_order = hl.range(hl.len(col_keys) - 1).map(
            lambda i: col_keys[i] > col_keys[i + 1])
        out_of_order = out_of_order.collect()[0]
        if any(out_of_order):
            raise ValueError(
                'from_matrix_table: columns are not in sorted order. You may request a '
                'sort with sort_columns=True.')

    mt = (mt
          .select_globals()
          .select_rows()
          .select_cols()
          .add_row_index(row_index)
          .localize_entries(entries, cols))
    # FIXME: remove when ndarray supports structs
    mt = mt.annotate(**{entries: mt[entries][entry_field]})
    mt = mt.annotate(
        **{col_blocks: hl.range(n_block_cols).map(
            lambda c: hl.struct(
                c=c,
                entries=mt[entries][(c * block_size):((c + 1) * block_size)]))}
    )
    mt = mt.explode(col_blocks)
    mt = mt.select(row_index, **mt[col_blocks])
    mt = mt.annotate(r=hl.int(mt[row_index] // block_size))
    mt = mt.key_by(mt.r, mt.c)
    mt = mt.group_by(mt.r, mt.c).aggregate(
        entries=hl.sorted(
            hl.agg.collect(hl.struct(row_index=mt[row_index], entries=mt.entries)),
            key=lambda x: x.row_index
        ).map(lambda x: x.entries))
    mt = mt.select(block=hl.nd.array(mt.entries))
    mt = mt.select_globals(
        n_rows=n_rows,
        n_cols=n_cols,
        n_block_rows=n_block_rows,
        n_block_cols=n_block_cols,
        block_size=block_size)
    fname = new_temp_file()
    mt = mt.key_by(mt.r, mt.c)
    mt.write(fname, _codec_spec=DNDArray.fast_codec_spec)
    t = hl.read_table(fname, _intervals=[
        hl.Interval(hl.Struct(r=i, c=j), hl.Struct(r=i, c=j + 1))
        for i in range(n_block_rows)
        for j in range(n_block_cols)])
    return DNDArray(t)
def unpersist_matrix_table(self, mt):
    return MatrixTable._from_java(mt._jmt.unpersist())
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all. This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset,
    even on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on. The point type of the interval must be a
        prefix of the partition key (when filtering a matrix table) or the
        key (when filtering a table), or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in
        `intervals`. If ``False``, keep only rows that fall outside all
        intervals in `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
    """
    if isinstance(ds, MatrixTable):
        n_pk = len(ds.partition_key)
        pk_type = ds.partition_key.dtype
    else:
        assert isinstance(ds, Table)
        if ds.key is None:
            raise TypeError("cannot filter intervals of an unkeyed Table")
        n_pk = len(ds.key)
        pk_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == pk_type[0]:
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, pk_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')".format(
                repr(point_type), repr(pk_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in intervals.value]
    if isinstance(ds, MatrixTable):
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(ds._jvds, intervals, keep)
        return MatrixTable(jmt)
    else:
        jt = Env.hail().methods.TableFilterIntervals.apply(ds._jt, intervals, keep)
        return Table(jt)
def persist_matrix_table(self, mt, storage_level):
    return MatrixTable._from_java(
        self._jbackend.pyPersistMatrix(storage_level, self._to_java_matrix_ir(mt._mir)))
def unpersist_matrix_table(self, mt, storage_level):
    return MatrixTable._from_java(mt._jmt.unpersist())
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all. This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset,
    even on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on. The point type of the interval must be a
        prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in
        `intervals`. If ``False``, keep only rows that fall outside all
        intervals in `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
    """
    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        needs_wrapper = True
        point_type = hl.tstruct(foo=point_type)
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')".format(
                repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals_type = intervals.dtype
    intervals = hl.eval(intervals)
    intervals = hl.tarray(hl.tinterval(point_type))._convert_to_json(
        [wrap_input(i) for i in intervals])

    if isinstance(ds, MatrixTable):
        config = {
            'name': 'MatrixFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return MatrixTable(MatrixToMatrixApply(ds._mir, config))
    else:
        config = {
            'name': 'TableFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return Table(TableToTableApply(ds._tir, config))
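# Hedged sketch of the endpoint-wrapping step above: a bare locus interval
# filtering a ['locus', 'alleles']-keyed dataset is lifted into a one-field
# struct interval so that its point type becomes a prefix of the key. The
# interval literal is hypothetical.
def _wrap_input_example():
    iv = hl.eval(hl.parse_locus_interval('1:100-200'))
    return Interval(Struct(foo=iv.start), Struct(foo=iv.end),
                    iv.includes_start, iv.includes_end)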
def require_biallelic(dataset, method) -> MatrixTable:
    require_row_key_variant(dataset, method)
    dataset = MatrixTable(Env.hail().methods.VerifyBiallelic.apply(dataset._jvds, method))
    return dataset
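# --- Added usage sketch, not part of the library source: require_biallelic is
# the guard a method calls before assuming exactly two alleles per row; the
# verification fails at runtime on multiallelic rows. The caller below is
# hypothetical and assumes `hl` is imported as elsewhere in this module.
def example_biallelic_method(mt):
    mt = require_biallelic(mt, 'example_biallelic_method')
    return mt.annotate_rows(n_non_ref=hl.agg.count_where(mt.GT.is_non_ref()))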
def vep(dataset, config, block_size=1000, name='vep', csq=False) -> MatrixTable:
    """Annotate variants with VEP.

    .. include:: ../_templates/req_tvariant.rst

    :func:`.vep` runs `Variant Effect Predictor
    <http://www.ensembl.org/info/docs/tools/vep/index.html>`__ with the
    `LOFTEE plugin <https://github.com/konradjk/loftee>`__ on the current
    dataset and adds the result as a row field.

    Examples
    --------

    Add VEP annotations to the dataset:

    >>> result = hl.vep(dataset, "data/vep.properties") # doctest: +SKIP

    Notes
    -----

    **Configuration**

    :func:`.vep` needs a configuration file to tell it how to run VEP. The
    format is a `.properties file <https://en.wikipedia.org/wiki/.properties>`__.
    Roughly, each line defines a property as a key-value pair of the form
    ``key = value``. :func:`.vep` supports the following properties:

    - **hail.vep.perl** -- Location of Perl. Optional, default: perl.
    - **hail.vep.perl5lib** -- Value for the PERL5LIB environment variable when
      invoking VEP. Optional, by default PERL5LIB is not set.
    - **hail.vep.path** -- Value of the PATH environment variable when invoking
      VEP. Optional, by default PATH is not set.
    - **hail.vep.location** -- Location of the VEP Perl script. Required.
    - **hail.vep.cache_dir** -- Location of the VEP cache dir, passed to VEP
      with the ``--dir`` option. Required.
    - **hail.vep.fasta** -- Location of the FASTA file to use to look up the
      reference sequence, passed to VEP with the ``--fasta`` option. Required.
    - **hail.vep.assembly** -- Genome assembly version to use. Optional,
      default: GRCh37.
    - **hail.vep.plugin** -- VEP plugin, passed to VEP with the ``--plugin``
      option. Optional. Overrides `hail.vep.lof.human_ancestor` and
      `hail.vep.lof.conservation_file`.
    - **hail.vep.lof.human_ancestor** -- Location of the human ancestor file
      for the LOFTEE plugin. Ignored if `hail.vep.plugin` is set. Required
      otherwise.
    - **hail.vep.lof.conservation_file** -- Location of the conservation file
      for the LOFTEE plugin. Ignored if `hail.vep.plugin` is set. Required
      otherwise.

    Here is an example ``vep.properties`` configuration file:

    .. code-block:: text

        hail.vep.perl = /usr/bin/perl
        hail.vep.path = /usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin
        hail.vep.location = /path/to/vep/ensembl-tools-release-81/scripts/variant_effect_predictor/variant_effect_predictor.pl
        hail.vep.cache_dir = /path/to/vep
        hail.vep.lof.human_ancestor = /path/to/loftee_data/human_ancestor.fa.gz
        hail.vep.lof.conservation_file = /path/to/loftee_data/phylocsf.sql

    **VEP Invocation**

    .. code-block:: text

        <hail.vep.perl>
        <hail.vep.location>
        --format vcf
        --json
        --everything
        --allele_number
        --no_stats
        --cache --offline
        --dir <hail.vep.cache_dir>
        --fasta <hail.vep.fasta>
        --minimal
        --assembly <hail.vep.assembly>
        --plugin LoF,\
        human_ancestor_fa:$<hail.vep.lof.human_ancestor>,\
        filter_position:0.05,\
        min_intron_size:15,\
        conservation_file:<hail.vep.lof.conservation_file>
        -o STDOUT

    **Annotations**

    A new row field is added in the location specified by `name` with the
    following schema:

    .. code-block:: text

        struct {
            assembly_name: str,
            allele_string: str,
            ancestral: str,
            colocated_variants: array<struct {
                aa_allele: str,
                aa_maf: float64,
                afr_allele: str,
                afr_maf: float64,
                allele_string: str,
                amr_allele: str,
                amr_maf: float64,
                clin_sig: array<str>,
                end: int32,
                eas_allele: str,
                eas_maf: float64,
                ea_allele: str,
                ea_maf: float64,
                eur_allele: str,
                eur_maf: float64,
                exac_adj_allele: str,
                exac_adj_maf: float64,
                exac_allele: str,
                exac_afr_allele: str,
                exac_afr_maf: float64,
                exac_amr_allele: str,
                exac_amr_maf: float64,
                exac_eas_allele: str,
                exac_eas_maf: float64,
                exac_fin_allele: str,
                exac_fin_maf: float64,
                exac_maf: float64,
                exac_nfe_allele: str,
                exac_nfe_maf: float64,
                exac_oth_allele: str,
                exac_oth_maf: float64,
                exac_sas_allele: str,
                exac_sas_maf: float64,
                id: str,
                minor_allele: str,
                minor_allele_freq: float64,
                phenotype_or_disease: int32,
                pubmed: array<int32>,
                sas_allele: str,
                sas_maf: float64,
                somatic: int32,
                start: int32,
                strand: int32
            }>,
            context: str,
            end: int32,
            id: str,
            input: str,
            intergenic_consequences: array<struct {
                allele_num: int32,
                consequence_terms: array<str>,
                impact: str,
                minimised: int32,
                variant_allele: str
            }>,
            most_severe_consequence: str,
            motif_feature_consequences: array<struct {
                allele_num: int32,
                consequence_terms: array<str>,
                high_inf_pos: str,
                impact: str,
                minimised: int32,
                motif_feature_id: str,
                motif_name: str,
                motif_pos: int32,
                motif_score_change: float64,
                strand: int32,
                variant_allele: str
            }>,
            regulatory_feature_consequences: array<struct {
                allele_num: int32,
                biotype: str,
                consequence_terms: array<str>,
                impact: str,
                minimised: int32,
                regulatory_feature_id: str,
                variant_allele: str
            }>,
            seq_region_name: str,
            start: int32,
            strand: int32,
            transcript_consequences: array<struct {
                allele_num: int32,
                amino_acids: str,
                biotype: str,
                canonical: int32,
                ccds: str,
                cdna_start: int32,
                cdna_end: int32,
                cds_end: int32,
                cds_start: int32,
                codons: str,
                consequence_terms: array<str>,
                distance: int32,
                domains: array<struct {
                    db: str,
                    name: str
                }>,
                exon: str,
                gene_id: str,
                gene_pheno: int32,
                gene_symbol: str,
                gene_symbol_source: str,
                hgnc_id: str,
                hgvsc: str,
                hgvsp: str,
                hgvs_offset: int32,
                impact: str,
                intron: str,
                lof: str,
                lof_flags: str,
                lof_filter: str,
                lof_info: str,
                minimised: int32,
                polyphen_prediction: str,
                polyphen_score: float64,
                protein_end: int32,
                protein_start: int32,
                protein_id: str,
                sift_prediction: str,
                sift_score: float64,
                strand: int32,
                swissprot: str,
                transcript_id: str,
                trembl: str,
                uniparc: str,
                variant_allele: str
            }>,
            variant_class: str
        }

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    config : :obj:`str`
        Path to VEP configuration file.
    block_size : :obj:`int`
        Number of rows to process per VEP invocation.
    name : :obj:`str`
        Name for resulting row field.
    csq : :obj:`bool`
        If ``True``, annotates VCF CSQ field as a :py:data:`.tstr`.
        If ``False``, annotates with the full nested struct schema.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with new row-indexed field `name` containing VEP annotations.
    """
    require_row_key_variant(dataset, 'vep')
    mt = MatrixTable(Env.hail().methods.VEP.apply(
        dataset._jvds, config, 'va.`{}`'.format(name), csq, block_size))
    return mt.annotate_rows(vep=mt['vep']['vep'])
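# --- Added follow-up sketch, not part of the library source: once `vep` has
# run, the nested schema documented above is an ordinary row field and can be
# queried like any other. The config path and field choices are illustrative;
# `dataset` is the same example dataset used in the docstring.
annotated = hl.vep(dataset, 'data/vep.properties')
lof_variants = annotated.filter_rows(
    annotated.vep.transcript_consequences.any(lambda csq: csq.lof == 'HC'))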
def filter_intervals(ds, intervals, keep=True) -> MatrixTable:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within a list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset,
    even on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable`
        Dataset.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  If there is only one row partition key, the
        point type of the interval can be the type of the first partition key.
        Otherwise, the interval point type must be a :class:`.Struct` matching
        the row partition key schema.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in
        `intervals`.  If ``False``, keep only rows that fall outside all
        intervals in `intervals`.

    Returns
    -------
    :class:`.MatrixTable`
    """
    n_pk = len(ds.partition_key)
    pk_type = ds.partition_key.dtype
    point_type = intervals.dtype.element_type.point_type

    if point_type == pk_type:
        needs_wrapper = False
    elif n_pk == 1 and point_type == ds.partition_key[0].dtype:
        needs_wrapper = True
    else:
        raise TypeError(
            "The point type does not match the row partition key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(pk_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in intervals.value]
    jmt = Env.hail().methods.FilterIntervals.apply(ds._jvds, intervals, keep)
    return MatrixTable(jmt)
def nirvana(dataset, config, block_size=500000, name='nirvana') -> MatrixTable:
    """Annotate variants using `Nirvana <https://github.com/Illumina/Nirvana>`_.

    .. include:: ../_templates/experimental.rst

    .. include:: ../_templates/req_tvariant.rst

    :func:`.nirvana` runs `Nirvana <https://github.com/Illumina/Nirvana>`_ on
    the current dataset and adds a new row field in the location specified by
    `name`.

    Examples
    --------

    Add Nirvana annotations to the dataset:

    >>> result = hl.nirvana(dataset, "data/nirvana.properties") # doctest: +SKIP

    Notes
    -----

    **Configuration**

    :func:`.nirvana` requires a configuration file. The format is a
    `.properties file <https://en.wikipedia.org/wiki/.properties>`__, where
    each line defines a property as a key-value pair of the form
    ``key = value``. :func:`.nirvana` supports the following properties:

    - **hail.nirvana.dotnet** -- Location of dotnet. Optional, default: dotnet.
    - **hail.nirvana.path** -- Value of the PATH environment variable when
      invoking Nirvana. Optional, by default PATH is not set.
    - **hail.nirvana.location** -- Location of Nirvana.dll. Required.
    - **hail.nirvana.reference** -- Location of reference genome. Required.
    - **hail.nirvana.cache** -- Location of cache. Required.
    - **hail.nirvana.supplementaryAnnotationDirectory** -- Location of
      Supplementary Database. Optional, no supplementary database by default.

    Here is an example ``nirvana.properties`` configuration file:

    .. code-block:: text

        hail.nirvana.location = /path/to/dotnet/netcoreapp1.1/Nirvana.dll
        hail.nirvana.reference = /path/to/nirvana/References/Homo_sapiens.GRCh37.Nirvana.dat
        hail.nirvana.cache = /path/to/nirvana/Cache/GRCh37/Ensembl84
        hail.nirvana.supplementaryAnnotationDirectory = /path/to/nirvana/SupplementaryDatabase/GRCh37

    **Annotations**

    A new row field is added in the location specified by `name` with the
    following schema:

    .. code-block:: text

        struct {
            chromosome: str,
            refAllele: str,
            position: int32,
            altAlleles: array<str>,
            cytogeneticBand: str,
            quality: float64,
            filters: array<str>,
            jointSomaticNormalQuality: int32,
            copyNumber: int32,
            strandBias: float64,
            recalibratedQuality: float64,
            variants: array<struct {
                altAllele: str,
                refAllele: str,
                chromosome: str,
                begin: int32,
                end: int32,
                phylopScore: float64,
                isReferenceMinor: bool,
                variantType: str,
                vid: str,
                isRecomposed: bool,
                regulatoryRegions: array<struct {
                    id: str,
                    consequence: set<str>,
                    type: str
                }>,
                clinvar: array<struct {
                    id: str,
                    reviewStatus: str,
                    isAlleleSpecific: bool,
                    alleleOrigins: array<str>,
                    refAllele: str,
                    altAllele: str,
                    phenotypes: array<str>,
                    medGenIds: array<str>,
                    omimIds: array<str>,
                    orphanetIds: array<str>,
                    geneReviewsId: str,
                    significance: str,
                    lastUpdatedDate: str,
                    pubMedIds: array<str>
                }>,
                cosmic: array<struct {
                    id: str,
                    isAlleleSpecific: bool,
                    refAllele: str,
                    altAllele: str,
                    gene: str,
                    sampleCount: int32,
                    studies: array<struct {
                        id: int32,
                        histology: str,
                        primarySite: str
                    }>
                }>,
                dbsnp: struct {
                    ids: array<str>
                },
                evs: struct {
                    coverage: int32,
                    sampleCount: int32,
                    allAf: float64,
                    afrAf: float64,
                    eurAf: float64
                },
                exac: struct {
                    coverage: int32,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    finAf: float64,
                    finAc: int32,
                    finAn: int32,
                    nfeAf: float64,
                    nfeAc: int32,
                    nfeAn: int32,
                    othAf: float64,
                    othAc: int32,
                    othAn: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32
                },
                globalAllele: struct {
                    globalMinorAllele: str,
                    globalMinorAlleleFrequency: float64
                },
                oneKg: struct {
                    ancestralAllele: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    eurAf: float64,
                    eurAc: int32,
                    eurAn: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32
                },
                transcripts: struct {
                    refSeq: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cDnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>,
                    ensembl: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cDnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>
                },
                genes: array<struct {
                    name: str,
                    omim: array<struct {
                        mimNumber: int32,
                        hgnc: str,
                        description: str,
                        phenotypes: array<struct {
                            mimNumber: int32,
                            phenotype: str,
                            mapping: str,
                            inheritance: array<str>,
                            comments: str
                        }>
                    }>
                }>
            }>
        }

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    config : :obj:`str`
        Path to Nirvana configuration file.
    block_size : :obj:`int`
        Number of rows to process per Nirvana invocation.
    name : :obj:`str`
        Name for resulting row field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with new row-indexed field `name` containing Nirvana annotations.
    """
    require_row_key_variant(dataset, 'nirvana')
    mt = MatrixTable(Env.hail().methods.Nirvana.apply(
        dataset._jvds, config, block_size, 'va.`{}`'.format(name)))
    return mt.annotate_rows(nirvana=mt['nirvana']['nirvana'])
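# --- Added follow-up sketch, not part of the library source, mirroring the
# vep example above: pull gene names out of the nested Nirvana annotation.
# The config path and field choices are illustrative and follow the schema
# documented in the docstring.
rows = hl.nirvana(dataset, 'data/nirvana.properties').rows()
gene_names = rows.select(
    genes=rows.nirvana.variants.flatmap(lambda v: v.genes.map(lambda g: g.name)))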
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within a list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset,
    even on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must be a
        prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in
        `intervals`.  If ``False``, keep only rows that fall outside all
        intervals in `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
    """
    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in hl.eval(intervals)]
    if isinstance(ds, MatrixTable):
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(ds._jmt, intervals, keep)
        return MatrixTable._from_java(jmt)
    else:
        jt = Env.hail().methods.TableFilterIntervals.apply(ds._jt, intervals, keep)
        return Table._from_java(jt)
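# --- Added sketch, not part of the library source: the struct-prefix rule
# shared by the filter_intervals variants above, on a table with a two-field
# key. Names and values are illustrative; assumes a running Hail session.
import hail as hl

t = hl.utils.range_table(50)
t = t.key_by(k1=t.idx // 10, k2=t.idx % 10)   # two-field key (int32, int32)

# Point type equal to the first key field: wrapped into one-field structs.
by_first = hl.filter_intervals(t, hl.array([hl.interval(1, 3)]))
assert by_first.count() == 20                 # rows with k1 in {1, 2}

# Struct point type that is a prefix of the key: used as-is.
by_prefix = hl.filter_intervals(
    t, hl.array([hl.interval(hl.struct(k1=1), hl.struct(k1=3))]))
assert by_prefix.count() == 20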