Code example #1
def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining"""
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0,
                    hl.len(mt.alleles) - 1),
        END=mt.info.END,
        PL=mt['PL'][0:],
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    mt = mt.annotate_rows(info=mt.info.annotate(
        DP=hl.agg.sum(mt.entry.DP),
        SB=hl.agg.array_sum(mt.entry.SB),
    ).select(
        "DP",
        "MQ_DP",
        "QUALapprox",
        "RAW_MQ",
        "VarDP",
        "SB",
    ))
    mt = mt.drop('SB', 'qual')

    return mt
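
A minimal usage sketch for the transform above. The gVCF path, reference genome, and output path are assumptions for illustration; `array_elements_required=False` is generally needed when importing gVCFs because FORMAT array fields may contain missing elements.

import hail as hl

# Hypothetical single-sample gVCF; adjust path and reference to your data.
mt = hl.import_vcf('data/sample.g.vcf.bgz',
                   reference_genome='GRCh38',
                   force_bgz=True,
                   array_elements_required=False)
mt = transform_one(mt)
mt.write('output/sample.transformed.mt', overwrite=True)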
Code example #2
File: vcf_combiner.py Project: atgenomix/hail
def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining"""
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0, hl.len(mt.alleles)),
        END=mt.info.END,
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    mt = mt.annotate_rows(
        info=mt.info.annotate(
            SB_TABLE=hl.array([
                hl.agg.sum(mt.entry.SB[0]),
                hl.agg.sum(mt.entry.SB[1]),
                hl.agg.sum(mt.entry.SB[2]),
                hl.agg.sum(mt.entry.SB[3]),
            ])
        ).select(
            "MQ_DP",
            "QUALapprox",
            "RAW_MQ",
            "VarDP",
            "SB_TABLE",
        ))
    mt = mt.transmute_entries(
        LGT=mt.GT,
        LAD=mt.AD[0:],  # requiredness issues :'(
        LPL=mt.PL[0:],
        LPGT=mt.PGT)
    mt = mt.drop('SB', 'qual', 'filters')

    return mt
Code example #3
def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Rename duplicate column keys.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    This method produces a new column field from the string column key by
    appending a unique suffix ``_N`` as necessary. For example, if the column
    key "NA12878" appears three times in the dataset, the first will produce
    "NA12878", the second will produce "NA12878_1", and the third will produce
    "NA12878_2". The name of this new field is parameterized by `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """

    return MatrixTable(dataset._jvds.renameDuplicates(name))
Code example #4
File: misc.py Project: danking/hail
def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Rename duplicate column keys.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    This method produces a new column field from the string column key by
    appending a unique suffix ``_N`` as necessary. For example, if the column
    key "NA12878" appears three times in the dataset, the first will produce
    "NA12878", the second will produce "NA12878_1", and the third will produce
    "NA12878_2". The name of this new field is parameterized by `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """

    return MatrixTable._from_java(dataset._jmt.renameDuplicates(name))
Code example #5
    def read_multiple_matrix_tables(self, paths: 'List[str]', intervals: 'List[hl.Interval]', intervals_type):
        json_repr = {
            'paths': paths,
            'intervals': intervals_type._convert_to_json(intervals),
            'intervalPointType': intervals_type.element_type.point_type._parsable_string(),
        }

        results = self._jhc.backend().pyReadMultipleMatrixTables(json.dumps(json_repr))
        return [MatrixTable._from_java(jm) for jm in results]
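
The method above is an internal backend helper. A hedged sketch of how it might be invoked, where `hc` stands in for the context object defining the method and the paths are hypothetical:

import hail as hl

paths = ['gs://bucket/chunk1.mt', 'gs://bucket/chunk2.mt']  # hypothetical paths
# One locus interval describing the desired partition bounds.
intervals = [hl.Interval(hl.Locus('1', 1, reference_genome='GRCh37'),
                         hl.Locus('1', 1_000_000, reference_genome='GRCh37'))]
intervals_type = hl.tarray(hl.tinterval(hl.tlocus('GRCh37')))
mts = hc.read_multiple_matrix_tables(paths, intervals, intervals_type)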
Code example #6
def window_by_locus(mt: MatrixTable, bp_window_size: int) -> MatrixTable:
    """Collect arrays of row and entry values from preceding loci.

    .. include:: ../_templates/req_tlocus.rst

    .. include:: ../_templates/experimental.rst

    Examples
    --------
    >>> ds_result = hl.window_by_locus(ds, 3)

    Notes
    -----
    This method groups each row (variant) with the previous rows in a window of
    `bp_window_size` base pairs, putting the row values from the previous
    variants into `prev_rows` (row field of type ``array<struct>``) and entry
    values from those variants into `prev_entries` (entry field of type
    ``array<struct>``).

    The `bp_window_size` argument is inclusive; if `bp_window_size` is 2 and the
    loci are

    .. code-block:: text

        1:100
        1:100
        1:102
        1:102
        1:103
        2:100
        2:101

    then the size of `prev_rows` is 0, 1, 2, 3, 2, 0, and 1, respectively (and
    same for the size of prev_entries).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Input dataset.
    bp_window_size : :obj:`int`
        Base pairs to include in the backwards window (inclusive).

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_first_key_field_locus(mt, 'window_by_locus')
    return MatrixTable(
        hl.ir.MatrixToMatrixApply(mt._mir, {
            'name': 'WindowByLocus',
            'basePairs': bp_window_size
        }))
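
A short follow-on sketch using the `prev_rows` field created above; the window size is illustrative:

# Count variants within 1000 bp upstream and keep rows with neighbors.
ds = hl.window_by_locus(ds, 1000)
ds = ds.annotate_rows(n_prev=hl.len(ds.prev_rows))
ds = ds.filter_rows(ds.n_prev > 0)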
Code example #7
File: family_methods.py Project: shulik7/hail
def trio_matrix(dataset, pedigree, complete_trios=False):
    """Builds and returns a matrix where columns correspond to trios and entries contain genotypes for the trio.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----

    This method builds a new matrix table with one column per trio. If
    `complete_trios` is ``True``, then only trios that satisfy
    :meth:`.Trio.is_complete` are included. In this new dataset, the column
    identifiers are the sample IDs of the trio probands. The column fields and
    entries of the matrix are changed in the following ways:

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a Boolean field, and a string field:

    - **proband.id** (:py:data:`.tstr`) - Proband sample ID, same as trio column key.
    - **proband.fields** (:class:`.tstruct`) - Column fields on the proband.
    - **father.id** (:py:data:`.tstr`) - Father sample ID.
    - **father.fields** (:class:`.tstruct`) - Column fields on the father.
    - **mother.id** (:py:data:`.tstr`) - Mother sample ID.
    - **mother.fields** (:class:`.tstruct`) - Column fields on the mother.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    pedigree : :class:`.Pedigree`

    Returns
    -------
    :class:`.MatrixTable`
    """
    return MatrixTable(dataset._jvds.trioMatrix(pedigree._jrep,
                                                complete_trios))
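
A hedged sketch built on the entry fields documented above, flagging the classic de novo configuration (proband het, both parents hom-ref):

td = hl.trio_matrix(dataset, pedigree, complete_trios=True)
candidates = td.filter_entries(td.proband_entry.GT.is_het()
                               & td.father_entry.GT.is_hom_ref()
                               & td.mother_entry.GT.is_hom_ref())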
Code example #8
File: vcf_combiner.py Project: bcajes/hail
def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining"""
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0, hl.len(mt.alleles) - 1),
        END=mt.info.END,
        PL=mt['PL'][0:],
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    # This collects all fields with median combiners into arrays so we can calculate medians
    # when needed
    mt = mt.annotate_rows(
        # now minrep'ed (ref, alt) allele pairs
        alleles=hl.bind(lambda ref: mt.alleles[1:].map(lambda alt:
                                                       # minrep <NON_REF>
                                                       hl.struct(ref=hl.cond(alt == "<NON_REF>",
                                                                             ref[0:1],
                                                                             ref),
                                                                 alt=alt)),
                        mt.alleles[0]),
        info=mt.info.annotate(
            SB=hl.agg.array_sum(mt.entry.SB)
        ).select(
            "DP",
            "MQ_DP",
            "QUALapprox",
            "RAW_MQ",
            "VarDP",
            "SB",
        ))
    mt = mt.drop('SB', 'qual')

    return mt
Code example #9
def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining"""
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0,
                    hl.len(mt.alleles) - 1),
        END=mt.info.END,
        PL=mt['PL'][0:],
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    # This collects all fields with median combiners into arrays so we can calculate medians
    # when needed
    mt = mt.annotate_rows(
        # now minrep'ed (ref, alt) allele pairs
        alleles=hl.bind(
            lambda ref: mt.alleles[1:].map(
                lambda alt:
                # minrep <NON_REF>
                hl.struct(ref=hl.cond(alt == "<NON_REF>", ref[0:1], ref),
                          alt=alt)),
            mt.alleles[0]),
        info=mt.info.annotate(SB=hl.agg.array_sum(mt.entry.SB)).select(
            "DP",
            "MQ_DP",
            "QUALapprox",
            "RAW_MQ",
            "VarDP",
            "SB",
        ))
    mt = mt.drop('SB', 'qual')

    return mt
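
A small illustration of the minimal-representation step above, evaluated on literal alleles (the allele strings are made up). A real alt keeps the full ref, while the symbolic <NON_REF> allele is paired with only the first ref base:

import hail as hl

alleles = hl.literal(["GCT", "G", "<NON_REF>"])
pairs = alleles[1:].map(
    lambda alt: hl.struct(ref=hl.cond(alt == "<NON_REF>",
                                      alleles[0][0:1],
                                      alleles[0]),
                          alt=alt))
hl.eval(pairs)  # [Struct(ref='GCT', alt='G'), Struct(ref='G', alt='<NON_REF>')]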
Code example #10
File: misc.py Project: danking/hail
def window_by_locus(mt: MatrixTable, bp_window_size: int) -> MatrixTable:
    """Collect arrays of row and entry values from preceding loci.

    .. include:: ../_templates/req_tlocus.rst

    .. include:: ../_templates/experimental.rst

    Examples
    --------
    >>> ds_result = hl.window_by_locus(ds, 3)

    Notes
    -----
    This method groups each row (variant) with the previous rows in a window of
    `bp_window_size` base pairs, putting the row values from the previous
    variants into `prev_rows` (row field of type ``array<struct>``) and entry
    values from those variants into `prev_entries` (entry field of type
    ``array<struct>``).

    The `bp_window_size` argument is inclusive; if `bp_window_size` is 2 and the
    loci are

    .. code-block:: text

        1:100
        1:100
        1:102
        1:102
        1:103
        2:100
        2:101

    then the size of `prev_rows` is 0, 1, 2, 3, 2, 0, and 1, respectively (and
    same for the size of prev_entries).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Input dataset.
    bp_window_size : :obj:`int`
        Base pairs to include in the backwards window (inclusive).

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_first_key_field_locus(mt, 'window_by_locus')
    return MatrixTable._from_java(mt._jmt.windowVariants(bp_window_size))
Code example #11
File: family_methods.py Project: bcajes/hail
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Builds and returns a matrix where columns correspond to trios and entries contain genotypes for the trio.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----

    This method builds a new matrix table with one column per trio. If
    `complete_trios` is ``True``, then only trios that satisfy
    :meth:`.Trio.is_complete` are included. In this new dataset, the column
    identifiers are the sample IDs of the trio probands. The column fields and
    entries of the matrix are changed in the following ways:

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a Boolean field, and a string field:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    pedigree : :class:`.Pedigree`

    Returns
    -------
    :class:`.MatrixTable`
    """
    return MatrixTable._from_java(dataset._jmt.trioMatrix(pedigree._jrep, complete_trios))
Code example #12
def summarize_variants(mt: MatrixTable, show=True):
    """Summarize the variants present in a dataset and print the results.

    Examples
    --------
    >>> hl.summarize_variants(dataset)  # doctest: +SKIP
    ==============================
    Number of variants: 346
    ==============================
    Alleles per variant
    -------------------
      2 alleles: 346 variants
    ==============================
    Variants per contig
    -------------------
      20: 346 variants
    ==============================
    Allele type distribution
    ------------------------
            SNP: 301 alleles
       Deletion: 27 alleles
      Insertion: 18 alleles
    ==============================

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table with a variant (locus / alleles) row key.
    show : :obj:`bool`
        If ``True``, print results instead of returning them.

    Notes
    -----
    The result returned if `show` is ``False`` is a :class:`.Struct` with
    four fields:

    - `n_variants` (:obj:`int`): Number of variants present in the matrix table.
    - `allele_types` (:obj:`Dict[str, int]`): Number of alternate alleles in
      each allele category.
    - `contigs` (:obj:`Dict[str, int]`): Number of variants on each contig.
    - `allele_counts` (:obj:`Dict[int, int]`): Number of variants broken down
      by number of alleles (biallelic is 2, for example).

    Returns
    -------
    :obj:`None` or :class:`.Struct`
        Returns ``None`` if `show` is ``True``, or returns results as a struct.
    """
    require_row_key_variant(mt, 'summarize_variants')
    alleles_per_variant = hl.range(1, hl.len(mt.alleles)).map(lambda i: hl.allele_type(mt.alleles[0], mt.alleles[i]))
    allele_types, contigs, allele_counts, n_variants = mt.aggregate_rows(
        (hl.agg.explode(lambda elt: hl.agg.counter(elt), alleles_per_variant),
         hl.agg.counter(mt.locus.contig),
         hl.agg.counter(hl.len(mt.alleles)),
         hl.agg.count()))
    rg = mt.locus.dtype.reference_genome
    contig_idx = {contig: i for i, contig in enumerate(rg.contigs)}
    if show:
        max_contig_len = max(len(contig) for contig in contigs)
        contig_formatter = f'%{max_contig_len}s'

        max_allele_count_len = max(len(str(x)) for x in allele_counts)
        allele_count_formatter = f'%{max_allele_count_len}s'

        max_allele_type_len = max(len(x) for x in allele_types)
        allele_type_formatter = f'%{max_allele_type_len}s'

        line_break = '=============================='

        print(line_break)
        print(f'Number of variants: {n_variants}')
        print(line_break)
        print('Alleles per variant')
        print('-------------------')
        for n_alleles, count in sorted(allele_counts.items(), key=lambda x: x[0]):
            print(f'  {allele_count_formatter % n_alleles} alleles: {count} variants')
        print(line_break)
        print('Variants per contig')
        print('-------------------')
        for contig, count in sorted(contigs.items(), key=lambda x: contig_idx[x[0]]):
            print(f'  {contig_formatter % contig}: {count} variants')
        print(line_break)
        print('Allele type distribution')
        print('------------------------')
        for allele_type, count in Counter(allele_types).most_common():
            print(f'  {allele_type_formatter % allele_type}: {count} alternate alleles')
        print(line_break)
    else:
        return hl.Struct(allele_types=allele_types,
                         contigs=contigs,
                         allele_counts=allele_counts,
                         n_variants=n_variants)
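
A brief sketch of the non-printing path: with `show=False` the function returns the struct described in the Notes, so the counts can be used programmatically.

stats = hl.summarize_variants(dataset, show=False)
print(stats.n_variants)
print(stats.allele_counts.get(2, 0))  # number of biallelic variants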
Code example #13
def variant_qc(dataset, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_biallelic.rst
    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes 18 variant statistics from the genotype data,
    returning a new struct field `name` with the following metrics:

    +-------------------------+---------+--------------------------------------------------------+
    | Name                    | Type    | Description                                            |
    +=========================+=========+========================================================+
    | ``call_rate``           | float64 | Fraction of samples with called genotypes              |
    +-------------------------+---------+--------------------------------------------------------+
    | ``AF``                  | float64 | Calculated alternate allele frequency (q)              |
    +-------------------------+---------+--------------------------------------------------------+
    | ``AC``                  | int32   | Count of alternate alleles                             |
    +-------------------------+---------+--------------------------------------------------------+
    | ``r_heterozygosity``    | float64 | Proportion of heterozygotes                            |
    +-------------------------+---------+--------------------------------------------------------+
    | ``r_het_hom_var``       | float64 | Ratio of heterozygotes to homozygous alternates        |
    +-------------------------+---------+--------------------------------------------------------+
    | ``r_expected_het_freq`` | float64 | Expected r_heterozygosity based on HWE                 |
    +-------------------------+---------+--------------------------------------------------------+
    | ``p_hwe``               | float64 | p-value from Hardy Weinberg Equilibrium null model     |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_hom_ref``           | int32   | Number of homozygous reference samples                 |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_het``               | int32   | Number of heterozygous samples                         |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_hom_var``           | int32   | Number of homozygous alternate samples                 |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_called``            | int32   | Sum of ``n_hom_ref``, ``n_het``, and ``n_hom_var``     |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_not_called``        | int32   | Number of uncalled samples                             |
    +-------------------------+---------+--------------------------------------------------------+
    | ``n_non_ref``           | int32   | Sum of ``n_het`` and ``n_hom_var``                     |
    +-------------------------+---------+--------------------------------------------------------+
    | ``dp_mean``             | float64 | Depth mean across all samples                          |
    +-------------------------+---------+--------------------------------------------------------+
    | ``dp_stdev``            | float64 | Depth standard deviation across all samples            |
    +-------------------------+---------+--------------------------------------------------------+
    | ``gq_mean``             | float64 | The average genotype quality across all samples        |
    +-------------------------+---------+--------------------------------------------------------+
    | ``gq_stdev``            | float64 | Genotype quality standard deviation across all samples |
    +-------------------------+---------+--------------------------------------------------------+

    Missing values ``NA`` may result from division by zero. The empirical
    standard deviation is computed with zero degrees of freedom.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new row-indexed field `name`.
    """

    return MatrixTable(Env.hail().methods.VariantQC.apply(
        require_biallelic(dataset, 'variant_qc')._jvds, name))
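
A hedged filtering sketch using the row field added above; the thresholds are illustrative, not recommendations:

dataset = hl.variant_qc(dataset)
dataset = dataset.filter_rows((dataset.variant_qc.call_rate > 0.95)
                              & (dataset.variant_qc.p_hwe > 1e-6))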
Code example #14
def sample_qc(dataset, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix and stores the results as
    a new column-indexed field in the matrix, named based on the `name` parameter.

    +--------------------------+---------+------------------------------------------------------+
    | Name                     | Type    | Description                                          |
    +==========================+=========+======================================================+
    | ``call_rate``            | float64 | Fraction of calls non-missing                        |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_hom_ref``            | int64   | Number of homozygous reference calls                 |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_het``                | int64   | Number of heterozygous calls                         |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_hom_var``            | int64   | Number of homozygous alternate calls                 |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_called``             | int64   | Sum of ``n_hom_ref`` + ``n_het`` + ``n_hom_var``     |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_not_called``         | int64   | Number of missing calls                              |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_snp``                | int64   | Number of SNP alternate alleles                      |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_insertion``          | int64   | Number of insertion alternate alleles                |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_deletion``           | int64   | Number of deletion alternate alleles                 |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_singleton``          | int64   | Number of private alleles                            |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_transition``         | int64   | Number of transition (A-G, C-T) alternate alleles    |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_transversion``       | int64   | Number of transversion alternate alleles             |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_star``               | int64   | Number of star (upstream deletion) alleles           |
    +--------------------------+---------+------------------------------------------------------+
    | ``n_non_ref``            | int64   | Sum of ``n_het`` and ``n_hom_var``                   |
    +--------------------------+---------+------------------------------------------------------+
    | ``r_ti_tv``              | float64 | Transition/Transversion ratio                        |
    +--------------------------+---------+------------------------------------------------------+
    | ``r_het_hom_var``        | float64 | Het/HomVar call ratio                                |
    +--------------------------+---------+------------------------------------------------------+
    | ``r_insertion_deletion`` | float64 | Insertion/Deletion allele ratio                      |
    +--------------------------+---------+------------------------------------------------------+
    | ``dp_mean``              | float64 | Depth mean across all calls                          |
    +--------------------------+---------+------------------------------------------------------+
    | ``dp_stdev``             | float64 | Depth standard deviation across all calls            |
    +--------------------------+---------+------------------------------------------------------+
    | ``gq_mean``              | float64 | The average genotype quality across all calls        |
    +--------------------------+---------+------------------------------------------------------+
    | ``gq_stdev``             | float64 | Genotype quality standard deviation across all calls |
    +--------------------------+---------+------------------------------------------------------+

    Missing values ``NA`` may result from division by zero. The empirical
    standard deviation is computed with zero degrees of freedom.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.
    """

    return MatrixTable(Env.hail().methods.SampleQC.apply(
        require_biallelic(dataset, 'sample_qc')._jvds, name))
Code example #15
File: backend.py Project: shutianxu/hail
    def persist_matrix_table(self, mt, storage_level):
        return MatrixTable._from_java(self._to_java_ir(mt._mir).pyPersist(storage_level))
Code example #16
File: array.py Project: theferrit32/hail
    def from_matrix_table(
            mt: MatrixTable,
            entry_field: str,
            *,
            n_partitions: Optional[int] = None,
            block_size: Optional[int] = None
    ) -> 'DNDArray':
        if n_partitions is None:
            n_partitions = mt.n_partitions()
        if block_size is None:
            block_size = DNDArray.default_block_size
        if n_partitions == 0:
            assert mt.count_cols() == 0
            assert mt.count_rows() == 0
            t = range_table(0, 0)
            t = t.annotate(r=0, c=0, block=hl.nd.array([]).reshape((0, 0)))
            t = t.select_globals(
                r_field='r',
                c_field='c',
                n_rows=0,
                n_cols=0,
                n_block_rows=0,
                n_block_cols=0,
                block_size=0)
            return DNDArray(t)

        assert 'r' not in mt.row
        assert 'c' not in mt.row
        assert 'block' not in mt.row

        n_rows, n_cols = mt.count()
        n_block_rows = (n_rows + block_size - 1) // block_size
        n_block_cols = (n_cols + block_size - 1) // block_size
        entries, cols, row_index, col_blocks = (Env.get_uid() for _ in range(4))
        mt = (mt
              .select_globals()
              .select_rows()
              .select_cols()
              .add_row_index(row_index)
              .localize_entries(entries, cols))
        # FIXME: remove when ndarray supports structs
        mt = mt.annotate(**{entries: mt[entries][entry_field]})
        mt = mt.annotate(
            **{col_blocks: hl.range(n_block_cols).map(
                lambda c: hl.struct(
                    c=c,
                    entries=mt[entries][(c * block_size):((c + 1) * block_size)]))}
        )
        mt = mt.explode(col_blocks)
        mt = mt.select(row_index, **mt[col_blocks])
        mt = mt.annotate(r=hl.int(mt[row_index] // block_size))
        mt = mt.key_by(mt.r, mt.c)
        mt = mt.group_by(mt.r, mt.c).aggregate(
            entries=hl.sorted(
                hl.agg.collect(hl.struct(row_index=mt[row_index], entries=mt.entries)),
                key=lambda x: x.row_index
            ).map(lambda x: x.entries))
        mt = mt.select(block=hl.nd.array(mt.entries))
        mt = mt.select_globals(
            r_field='r',
            c_field='c',
            n_rows=n_rows,
            n_cols=n_cols,
            n_block_rows=n_block_rows,
            n_block_cols=n_block_cols,
            block_size=block_size)
        fname = new_temp_file()
        mt = mt.key_by(mt.r, mt.c)
        mt.write(fname, _codec_spec=DNDArray.fast_codec_spec)
        t = hl.read_table(fname, _intervals=[
            hl.Interval(hl.Struct(r=i, c=j),
                        hl.Struct(r=i, c=j + 1))
            for i in range(n_block_rows)
            for j in range(n_block_cols)])
        return DNDArray(t)
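
A hedged usage sketch for the constructor above. Because ndarrays do not yet support structs (per the FIXME), the entry field must be numeric; `x` here is a made-up field derived from GT:

mt = mt.annotate_entries(x=hl.float64(mt.GT.n_alt_alleles()))
arr = DNDArray.from_matrix_table(mt, 'x', block_size=256)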
Code example #17
File: family_methods.py Project: tianyunwang/hail
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

     - `locus` (``locus``) -- Variant locus.
     - `alleles` (``array<str>``) -- Variant alleles.
     - `id` (``str``) -- Proband sample ID.
     - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
       the computed dataset alternate allele frequency, the
       `pop_frequency_prior` parameter, and the global prior
       ``1 / 3e7``.
     - `proband` (``struct``) -- Proband column fields from `mt`.
     - `father` (``struct``) -- Father column fields from `mt`.
     - `mother` (``struct``) -- Mother column fields from `mt`.
     - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
     - `father_entry` (``struct``) -- Father entry fields from `mt`.
     - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
     - `is_female` (``bool``) -- ``True`` if proband is female.
     - `p_de_novo` (``float64``) -- Unfiltered posterior probability
       that the event is *de novo* rather than a missed heterozygous
       event in a parent.
     - `confidence` (``str``) -- Validation confidence. One of: ``'HIGH'``,
       ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the simplifying
    assumption that when this configuration ``x = (AA, AA, AB)`` of calls
    occurs, exactly one of the following is true:

     - ``d``: a de novo mutation occurred in the proband and all calls are
       accurate.
     - ``m``: at least one parental allele is actually heterozygous and
       the proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)}

    Applying Bayes' rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) +
        \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the literature:

    .. math::

        \mathrm{P}(d) = \frac{1\,\text{mutation}}{30{,}000{,}000\,\text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x\,|\,d)` and :math:`\mathrm{P}(x\,|\,m)`
    are computed from the PL (genotype likelihood) fields using these
    factorizations:

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big(
        &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} =
        AA) \\ \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\,
        \mathrm{proband} = AB) \Big)

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big( &
        \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} =
        AA) \\ + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\,
        \mathrm{father} = AA) \cdot \mathrm{P}(x_{\mathrm{mother}} = AA
        \,|\, \mathrm{mother} = AB) \Big) \\ \cdot \,
        &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB)

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)

    While this posterior probability is a good metric for grouping putative de
    novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for by
    the phred-scaled genotype likelihoods. To this end, a number of hard filters
    are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the below
    rules, the following variables are used:

     - ``DR`` refers to the ratio of the read depth in the proband to the
       combined read depth in the parents.
     - ``AB`` refers to the read allele balance of the proband (number of
       alternate reads divided by total reads).
     - ``AC`` refers to the count of alternate alleles across all individuals
       in the dataset at the site.
     - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
     - ``min_p`` refers to the ``min_p`` function parameter.

    HIGH-quality SNV:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality SNV:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AB > 0.2 && AC == 1

    LOW-quality SNV:

    .. code-block:: text

        p > min_p && AB > 0.2

    HIGH-quality indel:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality indel:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AB > 0.2 && AC == 1

    LOW-quality indel:

    .. code-block:: text

        p > min_p && AB > 0.2

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the ``min_gq`` parameter, if the proband allele balance is
    lower than the ``min_child_ab`` parameter, if the depth ratio between the
    proband and parents is smaller than the ``min_dp_ratio`` parameter, or if
    the allele balance in a parent is above the ``max_parent_ab`` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.

    Returns
    -------
    :class:`.Table`
    """
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(
            f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
            f"missing {missing_fields}")

    mt = mt.annotate_rows(__prior=pop_frequency_prior,
                          __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()),
                          __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT)))
    # subtract 1 from __alt_alleles to correct for the observed genotype
    mt = mt.annotate_rows(
        __site_freq=hl.max((mt.__alt_alleles - 1) /
                           mt.__total_alleles, mt.__prior, MIN_POP_PRIOR))
    mt = require_biallelic(mt, 'de_novo')

    # FIXME check that __site_freq is between 0 and 1 when possible in expr
    tm = trio_matrix(mt, pedigree, complete_trios=True)

    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar()
                                                 & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    het_hom_hom = (tm.proband_entry.GT.is_het()
                   & tm.father_entry.GT.is_hom_ref()
                   & tm.mother_entry.GT.is_hom_ref())
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(
        tm.proband_entry.AD) < min_child_ab

    failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    kid_linear_pl = 10**(-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10**(-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10**(-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] *
                                   mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn +
                                       p_data_given_missed_het)

        def solve(p_de_novo):
            return (hl.case().when(kid.GQ < min_gq, failure).when(
                (kid.DP / (dad.DP + mom.DP) < min_dp_ratio)
                | ~(kid_ad_ratio >= min_child_ab), failure).when(
                    (hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0),
                    failure).when(
                        (mom.AD[1] / hl.sum(mom.AD) > max_parent_ab) |
                        (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab),
                        failure).when(p_de_novo < min_p, failure).when(
                            ~is_snp,
                            hl.case().when(
                                (p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                                (n_alt_alleles == 1),
                                hl.struct(p_de_novo=p_de_novo,
                                          confidence='HIGH')).when(
                                              (p_de_novo > 0.5) &
                                              (kid_ad_ratio > 0.3) &
                                              (n_alt_alleles <= 5),
                                              hl.struct(
                                                  p_de_novo=p_de_novo,
                                                  confidence='MEDIUM')).when(
                                                      (p_de_novo > 0.05) &
                                                      (kid_ad_ratio > 0.2),
                                                      hl.struct(
                                                          p_de_novo=p_de_novo,
                                                          confidence='LOW')).
                            or_missing()).default(hl.case().when(
                                ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                                 (dp_ratio > 0.2)) | ((p_de_novo > 0.99) &
                                                      (kid_ad_ratio > 0.3) &
                                                      (n_alt_alleles == 1)) |
                                ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) &
                                 (n_alt_alleles < 10) & (kid.DP > 10)),
                                hl.struct(p_de_novo=p_de_novo,
                                          confidence='HIGH')).when(
                                              (p_de_novo > 0.5) &
                                              ((kid_ad_ratio > 0.3) |
                                               (n_alt_alleles == 1)),
                                              hl.struct(
                                                  p_de_novo=p_de_novo,
                                                  confidence='MEDIUM')).when(
                                                      (p_de_novo > 0.05) &
                                                      (kid_ad_ratio > 0.2),
                                                      hl.struct(
                                                          p_de_novo=p_de_novo,
                                                          confidence='LOW')).
                                                  or_missing()))

        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = (parent_pp[1] +
                                   parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn +
                                       p_data_given_missed_het)

        def solve(p_de_novo):
            return (hl.case().when(kid.GQ < min_gq, failure).when(
                (kid.DP /
                 (parent.DP) < min_dp_ratio) | (kid_ad_ratio < min_child_ab),
                failure).when((hl.sum(parent.AD) == 0), failure).when(
                    parent.AD[1] / hl.sum(parent.AD) > max_parent_ab,
                    failure).when(p_de_novo < min_p, failure).when(
                        ~is_snp,
                        hl.case().when(
                            (p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                            (n_alt_alleles == 1),
                            hl.struct(
                                p_de_novo=p_de_novo, confidence='HIGH')).when(
                                    (p_de_novo > 0.5) & (kid_ad_ratio > 0.3) &
                                    (n_alt_alleles <= 5),
                                    hl.struct(p_de_novo=p_de_novo,
                                              confidence='MEDIUM')).when(
                                                  (p_de_novo > 0.05) &
                                                  (kid_ad_ratio > 0.3),
                                                  hl.struct(
                                                      p_de_novo=p_de_novo,
                                                      confidence='LOW')).
                        or_missing()).default(
                            hl.case().when(
                                ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                                 (dp_ratio > 0.2)) |
                                ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                                 (n_alt_alleles == 1)) |
                                ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) &
                                 (n_alt_alleles < 10) & (kid.DP > 10)),
                                hl.struct(p_de_novo=p_de_novo,
                                          confidence='HIGH')).when(
                                              (p_de_novo > 0.5) &
                                              ((kid_ad_ratio > 0.3) |
                                               (n_alt_alleles == 1)),
                                              hl.struct(p_de_novo=p_de_novo,
                                                        confidence='MEDIUM')).
                            when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                                 hl.struct(p_de_novo=p_de_novo,
                                           confidence='LOW')).or_missing()))

        return hl.bind(solve, p_de_novo)

    de_novo_call = (hl.case().when(~het_hom_hom | kid_ad_fail, failure).when(
        autosomal,
        hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio)).when(
            hemi_x | hemi_mt,
            hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio)).when(
                hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp,
                                kid_ad_ratio)).or_missing())

    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq', 'proband', 'father', 'mother',
                           'proband_entry', 'father_entry', 'mother_entry',
                           'is_female',
                           **entries.__call).rename({'__site_freq': 'prior'}))
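
A plain-Python sketch of the posterior arithmetic used in `call_auto` above, with made-up PLs and a site frequency of 0.01; it mirrors the Hail expressions but is not part of the source:

DE_NOVO_PRIOR = 1 / 30000000

def phred_to_pp(pl):
    # Convert phred-scaled likelihoods to normalized linear probabilities.
    linear = [10 ** (-x / 10) for x in pl]
    total = sum(linear)
    return [x / total for x in linear]

kid_pp = phred_to_pp([60, 0, 80])    # proband called het
dad_pp = phred_to_pp([0, 40, 400])   # father called hom-ref
mom_pp = phred_to_pp([0, 45, 450])   # mother called hom-ref

prior = 0.01
p_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
p_missed_het = ((dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1])
                * kid_pp[1] * (1 - (1 - prior) ** 4))
p_de_novo = p_dn / (p_dn + p_missed_het)
print(p_de_novo)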
Code example #18
File: family_methods.py Project: bcajes/hail
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

     - `locus` (``locus``) -- Variant locus.
     - `alleles` (``array<str>``) -- Variant alleles.
     - `id` (``str``) -- Proband sample ID.
     - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
       the computed dataset alternate allele frequency, the
       `pop_frequency_prior` parameter, and the global prior
       ``1 / 3e7``.
     - `proband` (``struct``) -- Proband column fields from `mt`.
     - `father` (``struct``) -- Father column fields from `mt`.
     - `mother` (``struct``) -- Mother column fields from `mt`.
     - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
     - `father_entry` (``struct``) -- Father entry fields from `mt`.
     - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
     - `is_female` (``bool``) -- ``True`` if proband is female.
     - `p_de_novo` (``float64``) -- Unfiltered posterior probability
       that the event is *de novo* rather than a missed heterozygous
       event in a parent.
     - `confidence` (``str``) -- Validation confidence. One of: ``'HIGH'``,
       ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the simplifying
    assumption that when this configuration ``x = (AA, AA, AB)`` of calls
    occurs, exactly one of the following is true:

     - ``d``: a de novo mutation occurred in the proband and all calls are
       accurate.
     - ``m``: at least one parental allele is actually heterozygous and
       the proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)}

    Applying Bayes' rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) +
        \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the literature:

    .. math::

        \mathrm{P}(d) = \frac{1\,\text{mutation}}{30{,}000{,}000\,\text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x\,|\,d)` and :math:`\mathrm{P}(x\,|\,m)`
    are computed from the PL (genotype likelihood) fields using these
    factorizations:

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big(
        &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} =
        AA) \\ \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\,
        \mathrm{proband} = AB) \Big)

    .. math::

        \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big( &
        \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} =
        AA) \\ + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\,
        \mathrm{father} = AA) \cdot \mathrm{P}(x_{\mathrm{mother}} = AA
        \,|\, \mathrm{mother} = AB) \Big) \\ \cdot \,
        &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB)

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)

    While this posterior probability is a good metric for grouping putative de
    novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for by
    the phred-scaled genotype likelihoods. To this end, a number of hard filters
    are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the below
    rules, the following variables are used:

     - ``DR`` refers to the ratio of the read depth in the proband to the
       combined read depth in the parents.
     - ``AB`` refers to the read allele balance of the proband (number of
       alternate reads divided by total reads).
     - ``AC`` refers to the count of alternate alleles across all individuals
       in the dataset at the site.
     - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
     - ``min_p`` refers to the ``min_p`` function parameter.

    HIGH-quality SNV:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality SNV:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AB > 0.2 && AC == 1

    LOW-quality SNV:

    .. code-block:: text

        p > min_p && AB > 0.2

    HIGH-quality indel:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality indel:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AB > 0.2 && AC == 1

    LOW-quality indel:

    .. code-block:: text

        p > min_p && AB > 0.2

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the ``min_gq`` parameter, if the proband allele balance is
    lower than the ``min_child_ab`` parameter, if the depth ratio between the
    proband and parents is smaller than the ``min_dp_ratio`` parameter, or if
    the allele balance in a parent is above the ``max_parent_ab`` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.

    Returns
    -------
    :class:`.Table`
    """
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
                         f"missing {missing_fields}")

    mt = mt.annotate_rows(__prior=pop_frequency_prior,
                          __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()),
                          __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT)))
    # subtract 1 from __alt_alleles to correct for the observed genotype
    mt = mt.annotate_rows(__site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles, mt.__prior, MIN_POP_PRIOR))
    mt = require_biallelic(mt, 'de_novo')

    # FIXME check that __site_freq is between 0 and 1 when possible in expr
    tm = trio_matrix(mt, pedigree, complete_trios=True)

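    # classify each locus by calling regime: diploid (autosomes, PAR, female
    # X), hemizygous X/Y in male probands, and mitochondrial loci, which are
    # compared against the mother only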
    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref() & tm.mother_entry.GT.is_hom_ref()
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD) < min_child_ab

    failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

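    # PL is phred-scaled: PL = -10 * log10(likelihood), so 10 ** (-PL / 10)
    # recovers linear-scale likelihoods; dividing by their sum normalizes
    # them into genotype probabilities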
    kid_linear_pl = 10 ** (-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10 ** (-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10 ** (-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            return (
                hl.case()
                    .when(kid.GQ < min_gq, failure)
                    .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio) |
                          ~(kid_ad_ratio >= min_child_ab), failure)
                    .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
                    .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab) |
                          (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
                    .when(p_de_novo < min_p, failure)
                    .when(~is_snp, hl.case()
                          .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                                hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                          .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                                hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                          .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                                hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                          .or_missing())
                    .default(hl.case()
                             .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                                   ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                                   ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                                   hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                             .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                                   hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                             .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                                   hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                             .or_missing()
                             )
            )

        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (parent_pp[1] + parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        def solve(p_de_novo):
            return (
                hl.case()
                    .when(kid.GQ < min_gq, failure)
                    .when((kid.DP / (parent.DP) < min_dp_ratio) |
                          (kid_ad_ratio < min_child_ab), failure)
                    .when((hl.sum(parent.AD) == 0), failure)
                    .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure)
                    .when(p_de_novo < min_p, failure)
                    .when(~is_snp, hl.case()
                          .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                                hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                          .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                                hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                          .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.3),
                                hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                          .or_missing())
                    .default(hl.case()
                             .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) |
                                   ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) |
                                   ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                                   hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                             .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                                   hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                             .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                                   hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                             .or_missing()
                             )
            )

        return hl.bind(solve, p_de_novo)

    de_novo_call = (
        hl.case()
            .when(~het_hom_hom | kid_ad_fail, failure)
            .when(autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio))
            .when(hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio))
            .when(hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio))
            .or_missing()
    )

    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq',
                           'proband',
                           'father',
                           'mother',
                           'proband_entry',
                           'father_entry',
                           'mother_entry',
                           'is_female',
                           **entries.__call)
            .rename({'__site_freq': 'prior'}))
Code example #19
0
File: backend.py Project: tpoterba/hail
 def unpersist_matrix_table(self, mt):
     return MatrixTable._from_java(self._to_java_ir(mt._mir).pyUnpersist())
Code example #20
0
File: backend.py Project: tpoterba/hail
 def persist_matrix_table(self, mt, storage_level):
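     # storage_level is a Spark storage level name, e.g. 'MEMORY_AND_DISK'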
     return MatrixTable._from_java(self._to_java_ir(mt._mir).pyPersist(storage_level))
Code example #21
0
    def from_matrix_table(
            mt: MatrixTable,
            entry_field: str,
            *,
            n_partitions: Optional[int] = None,
            block_size: Optional[int] = None,
            sort_columns: bool = False
    ) -> 'DNDArray':
        if n_partitions is None:
            n_partitions = mt.n_partitions()
        if block_size is None:
            block_size = DNDArray.default_block_size
        if n_partitions == 0:
            assert mt.count_cols() == 0
            assert mt.count_rows() == 0
            t = range_table(0, 0)
            t = t.annotate(r=0, c=0, block=nd.array([]).reshape((0, 0)))
            t = t.select_globals(
                n_rows=0,
                n_cols=0,
                n_block_rows=0,
                n_block_cols=0,
                block_size=0)
            return DNDArray(t)

        assert 'r' not in mt.row
        assert 'c' not in mt.row
        assert 'block' not in mt.row

        n_rows, n_cols = mt.count()
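        # ceiling division: number of block_size-sized blocks needed to
        # cover all rows and columns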
        n_block_rows = (n_rows + block_size - 1) // block_size
        n_block_cols = (n_cols + block_size - 1) // block_size
        entries, cols, row_index, col_blocks = (Env.get_uid() for _ in range(4))

        if sort_columns:
            col_index = Env.get_uid()
            col_order = mt.add_col_index(col_index)
            col_order = col_order.key_cols_by().cols()
            col_order = col_order.select(key=col_order.row.select(*mt.col_key),
                                         index=col_order[col_index])
            col_order = col_order.collect(_localize=False)
            col_order = hl.sorted(col_order, key=lambda x: x.key)
            col_order = col_order['index'].collect()[0]
            mt = mt.choose_cols(col_order)
        else:
            col_keys = mt.col_key.collect(_localize=False)
            out_of_order = hl.range(hl.len(col_keys) - 1).map(
                lambda i: col_keys[i] > col_keys[i + 1])
            out_of_order = out_of_order.collect()[0]
            if any(out_of_order):
                raise ValueError(
                    'from_matrix_table: columns are not in sorted order. You may request a '
                    'sort with sort_columns=True.')

        mt = (mt
              .select_globals()
              .select_rows()
              .select_cols()
              .add_row_index(row_index)
              .localize_entries(entries, cols))
        # FIXME: remove when ndarray supports structs
        mt = mt.annotate(**{entries: mt[entries][entry_field]})
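        # chop each row's entry vector into n_block_cols chunks of width
        # block_size; exploding below yields one row per (row, column-block)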
        mt = mt.annotate(
            **{col_blocks: hl.range(n_block_cols).map(
                lambda c: hl.struct(
                    c=c,
                    entries=mt[entries][(c * block_size):((c + 1) * block_size)]))}
        )
        mt = mt.explode(col_blocks)
        mt = mt.select(row_index, **mt[col_blocks])
        mt = mt.annotate(r=hl.int(mt[row_index] // block_size))
        mt = mt.key_by(mt.r, mt.c)
        mt = mt.group_by(mt.r, mt.c).aggregate(
            entries=hl.sorted(
                hl.agg.collect(hl.struct(row_index=mt[row_index], entries=mt.entries)),
                key=lambda x: x.row_index
            ).map(lambda x: x.entries))
        mt = mt.select(block=hl.nd.array(mt.entries))
        mt = mt.select_globals(
            n_rows=n_rows,
            n_cols=n_cols,
            n_block_rows=n_block_rows,
            n_block_cols=n_block_cols,
            block_size=block_size)
        fname = new_temp_file()
        mt = mt.key_by(mt.r, mt.c)
        mt.write(fname, _codec_spec=DNDArray.fast_codec_spec)
        t = hl.read_table(fname, _intervals=[
            hl.Interval(hl.Struct(r=i, c=j),
                        hl.Struct(r=i, c=j + 1))
            for i in range(n_block_rows)
            for j in range(n_block_cols)])
        return DNDArray(t)
Code example #22
0
 def unpersist_matrix_table(self, mt):
     return MatrixTable._from_java(mt._jmt.unpersist())
Code example #23
0
File: misc.py Project: maccum/hail
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.
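
    For example, a narrow "point query" can be phrased as a one-base
    interval:

    >>> point = hl.parse_locus_interval('17:38449840-38449841')
    >>> ds_hit = hl.filter_intervals(dataset, [point])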

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the partition key (when filtering a matrix
        table) or the key (when filtering a table), or equal to the
        first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    """

    if isinstance(ds, MatrixTable):
        n_pk = len(ds.partition_key)
        pk_type = ds.partition_key.dtype
    else:
        assert isinstance(ds, Table)
        if ds.key is None:
            raise TypeError("cannot filter intervals of an unkeyed Table")
        n_pk = len(ds.key)
        pk_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == pk_type[0]:
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(
            point_type, pk_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(pk_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'."
            )
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end), interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in intervals.value]
    if isinstance(ds, MatrixTable):
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(
            ds._jvds, intervals, keep)
        return MatrixTable(jmt)
    else:
        jt = Env.hail().methods.TableFilterIntervals.apply(
            ds._jt, intervals, keep)
        return Table(jt)
Code example #24
0
File: spark_backend.py Project: TileDB-Inc/hail
 def persist_matrix_table(self, mt, storage_level):
     return MatrixTable._from_java(
         self._jbackend.pyPersistMatrix(storage_level,
                                        self._to_java_matrix_ir(mt._mir)))
Code example #25
0
File: backend.py Project: Quiltomics/hail
 def unpersist_matrix_table(self, mt, storage_level):
     return MatrixTable._from_java(mt._jmt.unpersist())
Code example #26
0
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    """

    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        needs_wrapper = True
        point_type = hl.tstruct(foo=point_type)
    elif isinstance(point_type, tstruct) and is_struct_prefix(
            point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'."
            )
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end), interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals_type = intervals.dtype
    intervals = hl.eval(intervals)
    intervals = hl.tarray(hl.tinterval(point_type))._convert_to_json(
        [wrap_input(i) for i in intervals])

    if isinstance(ds, MatrixTable):
        config = {
            'name': 'MatrixFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return MatrixTable(MatrixToMatrixApply(ds._mir, config))
    else:
        config = {
            'name': 'TableFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return Table(TableToTableApply(ds._tir, config))
Code example #27
0
def require_biallelic(dataset, method) -> MatrixTable:
    require_row_key_variant(dataset, method)
    dataset = MatrixTable(Env.hail().methods.VerifyBiallelic.apply(
        dataset._jvds, method))
    return dataset
Code example #28
0
def vep(dataset,
        config,
        block_size=1000,
        name='vep',
        csq=False) -> MatrixTable:
    """Annotate variants with VEP.

    .. include:: ../_templates/req_tvariant.rst

    :func:`.vep` runs `Variant Effect Predictor
    <http://www.ensembl.org/info/docs/tools/vep/index.html>`__ with the `LOFTEE
    plugin <https://github.com/konradjk/loftee>`__ on the current dataset and
    adds the result as a row field.

    Examples
    --------

    Add VEP annotations to the dataset:

    >>> result = hl.vep(dataset, "data/vep.properties") # doctest: +SKIP

    Notes
    -----

    **Configuration**

    :func:`.vep` needs a configuration file to tell it
    how to run VEP. The format is a `.properties file
    <https://en.wikipedia.org/wiki/.properties>`__. Roughly, each line defines a
    property as a key-value pair of the form `key = value`. :func:`.vep` supports the
    following properties:

    - **hail.vep.perl** -- Location of Perl. Optional, default: perl.
    - **hail.vep.perl5lib** -- Value for the PERL5LIB environment variable when
      invoking VEP. Optional, by default PERL5LIB is not set.
    - **hail.vep.path** -- Value of the PATH environment variable when invoking
      VEP.  Optional, by default PATH is not set.
    - **hail.vep.location** -- Location of the VEP Perl script.  Required.
    - **hail.vep.cache_dir** -- Location of the VEP cache dir, passed to VEP
      with the ``--dir`` option. Required.
    - **hail.vep.fasta** -- Location of the FASTA file to use to look up the
      reference sequence, passed to VEP with the `--fasta` option. Required.
    - **hail.vep.assembly** -- Genome assembly version to use. Optional,
      default: GRCh37.
    - **hail.vep.plugin** -- VEP plugin, passed to VEP with the `--plugin`
      option. Optional. Overrides `hail.vep.lof.human_ancestor` and
      `hail.vep.lof.conservation_file`.
    - **hail.vep.lof.human_ancestor** -- Location of the human ancestor file for
      the LOFTEE plugin. Ignored if `hail.vep.plugin` is set. Required otherwise.
    - **hail.vep.lof.conservation_file** -- Location of the conservation file
      for the LOFTEE plugin. Ignored if `hail.vep.plugin` is set. Required
      otherwise.

    Here is an example ``vep.properties`` configuration file:

    .. code-block:: text

        hail.vep.perl = /usr/bin/perl
        hail.vep.path = /usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin
        hail.vep.location = /path/to/vep/ensembl-tools-release-81/scripts/variant_effect_predictor/variant_effect_predictor.pl
        hail.vep.cache_dir = /path/to/vep
        hail.vep.lof.human_ancestor = /path/to/loftee_data/human_ancestor.fa.gz
        hail.vep.lof.conservation_file = /path/to/loftee_data/phylocsf.sql

    **VEP Invocation**

    .. code-block:: text

        <hail.vep.perl>
        <hail.vep.location>
        --format vcf
        --json
        --everything
        --allele_number
        --no_stats
        --cache --offline
        --dir <hail.vep.cache_dir>
        --fasta <hail.vep.fasta>
        --minimal
        --assembly <hail.vep.assembly>
        --plugin LoF,\
        human_ancestor_fa:$<hail.vep.lof.human_ancestor>,\
        filter_position:0.05,\
        min_intron_size:15,\
        conservation_file:<hail.vep.lof.conservation_file>
        -o STDOUT

    **Annotations**

    A new row field is added in the location specified by `name` with the
    following schema:

    .. code-block:: text

        struct {
            assembly_name: str,
            allele_string: str,
            ancestral: str,
            colocated_variants: array<struct {
                aa_allele: str,
                aa_maf: float64,
                afr_allele: str,
                afr_maf: float64,
                allele_string: str,
                amr_allele: str,
                amr_maf: float64,
                clin_sig: array<str>,
                end: int32,
                eas_allele: str,
                eas_maf: float64,
                ea_allele: str,
                ea_maf: float64,
                eur_allele: str,
                eur_maf: float64,
                exac_adj_allele: str,
                exac_adj_maf: float64,
                exac_allele: str,
                exac_afr_allele: str,
                exac_afr_maf: float64,
                exac_amr_allele: str,
                exac_amr_maf: float64,
                exac_eas_allele: str,
                exac_eas_maf: float64,
                exac_fin_allele: str,
                exac_fin_maf: float64,
                exac_maf: float64,
                exac_nfe_allele: str,
                exac_nfe_maf: float64,
                exac_oth_allele: str,
                exac_oth_maf: float64,
                exac_sas_allele: str,
                exac_sas_maf: float64,
                id: str,
                minor_allele: str,
                minor_allele_freq: float64,
                phenotype_or_disease: int32,
                pubmed: array<int32>,
                sas_allele: str,
                sas_maf: float64,
                somatic: int32,
                start: int32,
                strand: int32
            }>,
            context: str,
            end: int32,
            id: str,
            input: str,
            intergenic_consequences: array<struct {
                allele_num: int32,
                consequence_terms: array<str>,
                impact: str,
                minimised: int32,
                variant_allele: str
            }>,
            most_severe_consequence: str,
            motif_feature_consequences: array<struct {
                allele_num: int32,
                consequence_terms: array<str>,
                high_inf_pos: str,
                impact: str,
                minimised: int32,
                motif_feature_id: str,
                motif_name: str,
                motif_pos: int32,
                motif_score_change: float64,
                strand: int32,
                variant_allele: str
            }>,
            regulatory_feature_consequences: array<struct {
                allele_num: int32,
                biotype: str,
                consequence_terms: array<str>,
                impact: str,
                minimised: int32,
                regulatory_feature_id: str,
                variant_allele: str
            }>,
            seq_region_name: str,
            start: int32,
            strand: int32,
            transcript_consequences: array<struct {
                allele_num: int32,
                amino_acids: str,
                biotype: str,
                canonical: int32,
                ccds: str,
                cdna_start: int32,
                cdna_end: int32,
                cds_end: int32,
                cds_start: int32,
                codons: str,
                consequence_terms: array<str>,
                distance: int32,
                domains: array<struct {
                    db: str,
                    name: str
                }>,
                exon: str,
                gene_id: str,
                gene_pheno: int32,
                gene_symbol: str,
                gene_symbol_source: str,
                hgnc_id: str,
                hgvsc: str,
                hgvsp: str,
                hgvs_offset: int32,
                impact: str,
                intron: str,
                lof: str,
                lof_flags: str,
                lof_filter: str,
                lof_info: str,
                minimised: int32,
                polyphen_prediction: str,
                polyphen_score: float64,
                protein_end: int32,
                protein_start: int32,
                protein_id: str,
                sift_prediction: str,
                sift_score: float64,
                strand: int32,
                swissprot: str,
                transcript_id: str,
                trembl: str,
                uniparc: str,
                variant_allele: str
            }>,
            variant_class: str
        }

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    config : :obj:`str`
        Path to VEP configuration file.
    block_size : :obj:`int`
        Number of rows to process per VEP invocation.
    name : :obj:`str`
        Name for resulting row field.
    csq : :obj:`bool`
        If ``True``, annotates VCF CSQ field as a :py:data:`.tstr`.
        If ``False``, annotates with the full nested struct schema.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with new row-indexed field `name` containing VEP annotations.
    """

    require_row_key_variant(dataset, 'vep')
    mt = MatrixTable(Env.hail().methods.VEP.apply(dataset._jvds, config,
                                                  'va.`{}`'.format(name), csq,
                                                  block_size))
    return mt.annotate_rows(vep=mt['vep']['vep'])
Code example #29
0
def filter_intervals(ds, intervals, keep=True) -> MatrixTable:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable`
        Dataset.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on. If there is only one row partition key, the
        point type of the interval can be the type of the first partition key.
        Otherwise, the interval point type must be a :class:`.Struct` matching
        the row partition key schema.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable`
    """

    n_pk = len(ds.partition_key)
    pk_type = ds.partition_key.dtype
    point_type = intervals.dtype.element_type.point_type

    if point_type == pk_type:
        needs_wrapper = False
    elif n_pk == 1 and point_type == ds.partition_key[0].dtype:
        needs_wrapper = True
    else:
        raise TypeError(
            "The point type does not match the row partition key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(pk_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'."
            )
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end), interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in intervals.value]
    jmt = Env.hail().methods.FilterIntervals.apply(ds._jvds, intervals, keep)
    return MatrixTable(jmt)
Code example #30
0
def nirvana(dataset, config, block_size=500000, name='nirvana') -> MatrixTable:
    """Annotate variants using `Nirvana <https://github.com/Illumina/Nirvana>`_.

    .. include:: ../_templates/experimental.rst

    .. include:: ../_templates/req_tvariant.rst

    :func:`.nirvana` runs `Nirvana
    <https://github.com/Illumina/Nirvana>`_ on the current dataset and adds a
    new row field in the location specified by `name`.

    Examples
    --------

    Add Nirvana annotations to the dataset:

    >>> result = hl.nirvana(dataset, "data/nirvana.properties") # doctest: +SKIP

    Notes
    -----

    **Configuration**

    :func:`.nirvana` requires a configuration file. The format is a
    `.properties file <https://en.wikipedia.org/wiki/.properties>`__, where each
    line defines a property as a key-value pair of the form ``key = value``.
    :func:`.nirvana` supports the following properties:

    - **hail.nirvana.dotnet** -- Location of dotnet. Optional, default: dotnet.
    - **hail.nirvana.path** -- Value of the PATH environment variable when
      invoking Nirvana. Optional, by default PATH is not set.
    - **hail.nirvana.location** -- Location of Nirvana.dll. Required.
    - **hail.nirvana.reference** -- Location of reference genome. Required.
    - **hail.nirvana.cache** -- Location of cache. Required.
    - **hail.nirvana.supplementaryAnnotationDirectory** -- Location of
      Supplementary Database. Optional, no supplementary database by default.

    Here is an example ``nirvana.properties`` configuration file:

    .. code-block:: text

        hail.nirvana.location = /path/to/dotnet/netcoreapp1.1/Nirvana.dll
        hail.nirvana.reference = /path/to/nirvana/References/Homo_sapiens.GRCh37.Nirvana.dat
        hail.nirvana.cache = /path/to/nirvana/Cache/GRCh37/Ensembl84
        hail.nirvana.supplementaryAnnotationDirectory = /path/to/nirvana/SupplementaryDatabase/GRCh37

    **Annotations**

    A new row field is added in the location specified by `name` with the
    following schema:

    .. code-block:: text

        struct {
            chromosome: str,
            refAllele: str,
            position: int32,
            altAlleles: array<str>,
            cytogeneticBand: str,
            quality: float64,
            filters: array<str>,
            jointSomaticNormalQuality: int32,
            copyNumber: int32,
            strandBias: float64,
            recalibratedQuality: float64,
            variants: array<struct {
                altAllele: str,
                refAllele: str,
                chromosome: str,
                begin: int32,
                end: int32,
                phylopScore: float64,
                isReferenceMinor: bool,
                variantType: str,
                vid: str,
                isRecomposed: bool,
                regulatoryRegions: array<struct {
                    id: str,
                    consequence: set<str>,
                    type: str
                }>,
                clinvar: array<struct {
                    id: str,
                    reviewStatus: str,
                    isAlleleSpecific: bool,
                    alleleOrigins: array<str>,
                    refAllele: str,
                    altAllele: str,
                    phenotypes: array<str>,
                    medGenIds: array<str>,
                    omimIds: array<str>,
                    orphanetIds: array<str>,
                    geneReviewsId: str,
                    significance: str,
                    lastUpdatedDate: str,
                    pubMedIds: array<str>
                }>,
                cosmic: array<struct {
                    id: str,
                    isAlleleSpecific: bool,
                    refAllele: str,
                    altAllele: str,
                    gene: str,
                    sampleCount: int32,
                    studies: array<struct {
                        id: int32,
                        histology: str,
                        primarySite: str
                    }>
                }>,
                dbsnp: struct {
                    ids: array<str>
                },
                evs: struct {
                    coverage: int32,
                    sampleCount: int32,
                    allAf: float64,
                    afrAf: float64,
                    eurAf: float64
                },
                exac: struct {
                    coverage: int32,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    finAf: float64,
                    finAc: int32,
                    finAn: int32,
                    nfeAf: float64,
                    nfeAc: int32,
                    nfeAn: int32,
                    othAf: float64,
                    othAc: int32,
                    othAn: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32
                },
                globalAllele: struct {
                    globalMinorAllele: str,
                    globalMinorAlleleFrequency: float64
                },
                oneKg: struct {
                    ancestralAllele: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    eurAf: float64,
                    eurAc: int32,
                    eurAn: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32
                },
                transcripts: struct {
                    refSeq: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cDnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>,
                    ensembl: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cDnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>
                },
                genes: array<struct {
                    name: str,
                    omim: array<struct {
                        mimNumber: int32,
                        hgnc: str,
                        description: str,
                        phenotypes: array<struct {
                            mimNumber: int32,
                            phenotype: str,
                            mapping: str,
                            inheritance: array<str>,
                            comments: str
                        }>
                    }>
                }>
            }>
        }

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    config : :obj:`str`
        Path to Nirvana configuration file.
    block_size : :obj:`int`
        Number of rows to process per Nirvana invocation.
    name : :obj:`str`
        Name for resulting row field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with new row-indexed field `name` containing Nirvana annotations.
    """

    require_row_key_variant(dataset, 'nirvana')
    mt = MatrixTable(Env.hail().methods.Nirvana.apply(dataset._jvds, config,
                                                      block_size,
                                                      'va.`{}`'.format(name)))
    return mt.annotate_rows(nirvana=mt['nirvana']['nirvana'])
Code example #31
0
File: backend.py Project: shutianxu/hail
 def unpersist_matrix_table(self, mt):
     return MatrixTable._from_java(self._to_java_ir(mt._mir).pyUnpersist())
Code example #32
0
File: misc.py Project: danking/hail
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    """

    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError("The point type is incompatible with key type of the dataset ('{}', '{}')".format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in hl.eval(intervals)]
    if isinstance(ds, MatrixTable):
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(ds._jmt, intervals, keep)
        return MatrixTable._from_java(jmt)
    else:
        jt = Env.hail().methods.TableFilterIntervals.apply(ds._jt, intervals, keep)
        return Table._from_java(jt)