Exemple #1
    def __matmul__(self, right: 'DNDArray') -> 'DNDArray':
        """Return the block-wise matrix product ``self @ right``.

        One table row is created per (output block row ``r``, output block
        column ``c``, inner block index ``k``) triple. Each row multiplies
        block ``(r, k)`` of ``self`` by block ``(k, c)`` of `right`, and the
        partial products are then aggregated per output block ``(r, c)``.

        Parameters
        ----------
        right : DNDArray
            Right operand. Must have the same block size as ``self``, and its
            row count and row blocking must equal the column count and column
            blocking of ``self``.

        Returns
        -------
        DNDArray
            The product, keyed by ``(r, c)`` with a single ``block`` field.
        """
        left = self
        assert left.block_size == right.block_size
        assert left.n_cols == right.n_rows
        assert left.n_block_cols == right.n_block_rows

        n_rows = left.n_rows
        n_cols = right.n_cols
        block_size = left.block_size

        n_block_rows = left.n_block_rows
        n_block_inner = left.n_block_cols
        n_block_cols = right.n_block_cols
        n_multiplies = n_block_rows * n_block_cols * n_block_inner

        # One row (and one partition) per block multiplication; ``idx`` is
        # decoded into the (r, c, k) coordinates of that multiplication.
        o = hl.utils.range_table(n_multiplies, n_partitions=n_multiplies)
        o = o.key_by(
            r=o.idx // (n_block_cols * n_block_inner),
            c=(o.idx % (n_block_cols * n_block_inner)) // n_block_inner,
            k=o.idx % n_block_inner)
        o = o._key_by_assert_sorted('r', 'c', 'k')
        # Re-key so that each operand can be looked up by its own block key.
        o = o._key_by_assert_sorted('r', 'k', 'c')
        o = o.annotate(left=left.m[o.r, o.k].block)
        o = o._key_by_assert_sorted('k', 'c', 'r')
        o = o.annotate(right=right.m[o.k, o.c].block)
        o = o.annotate(product=o.left @ o.right)

        # FIXME: use ndarray sum / fma
        def ndarray_to_array(ndarray):
            # Flatten the block in column-major order so partial products can
            # be summed as plain arrays and rebuilt with from_column_major.
            # NOTE(review): the two shape arguments were missing from the
            # truncated source; they are implied by the two-argument lambda.
            return hl.rbind(
                ndarray.shape[0],
                ndarray.shape[1],
                lambda n_rows, n_cols: hl.range(hl.int(n_rows * n_cols)).map(
                    lambda absolute: o.product[absolute % n_rows, absolute // n_rows]))
        # NOTE(review): the ``product=`` argument was missing from the
        # truncated source; confirm against upstream.
        o = o.annotate(shape=o.product.shape,
                       product=ndarray_to_array(o.product))
        o = o._key_by_assert_sorted('r', 'c', 'k')
        o = o._key_by_assert_sorted('r', 'c')

        import hail.methods.misc as misc
        misc.require_key(o, 'collect_by_key')
        import hail.ir as ir

        # NOTE(review): only the ``shape=`` line of this aggregation survived
        # in the source; the child IR, the ``block`` element-wise sum and the
        # global fields below are reconstructed from how ``o.block``,
        # ``o.shape`` and the otherwise-unused locals are used.
        o = Table(ir.TableAggregateByKey(
            o._tir,
            hl.struct(
                shape=hl.agg.take(o.shape, 1)[0],
                block=hl.agg.array_sum(o.product))._ir))
        o = o.annotate(block=hl.nd.from_column_major(o.block, o.shape))
        o = o.select('block')
        o = o.select_globals(
            n_rows=n_rows,
            n_cols=n_cols,
            n_block_rows=n_block_rows,
            n_block_cols=n_block_cols,
            block_size=block_size)
        return DNDArray(o)
Exemple #2
    def _block_inner_product(self,
                             right: 'DNDArray',
                             block_product: Callable[[Expression, Expression], Expression],
                             block_aggregate: Callable[[Expression], Expression]
                             ) -> 'DNDArray':
        """Combine ``self`` and `right` block-wise with a generalized inner product.

        One table row is created per (output block row ``r``, output block
        column ``c``, inner block index ``k``) triple. `block_product` is
        applied to block ``(r, k)`` of ``self`` and block ``(k, c)`` of
        `right`; the results are then reduced per output block ``(r, c)``
        with `block_aggregate`.

        Parameters
        ----------
        right : DNDArray
            Right operand. Must have the same block size as ``self``, and its
            row count and row blocking must equal the column count and column
            blocking of ``self``.
        block_product : callable
            Maps a (left block, right block) pair of expressions to the
            per-``k`` partial result.
        block_aggregate : callable
            Aggregates the partial results that share an ``(r, c)`` key.

        Returns
        -------
        DNDArray
            Result keyed by ``(r, c)`` with a single ``block`` field.
        """
        left = self
        assert left.block_size == right.block_size
        assert left.n_cols == right.n_rows
        assert left.n_block_cols == right.n_block_rows

        n_rows = left.n_rows
        n_cols = right.n_cols
        block_size = left.block_size

        n_block_rows = left.n_block_rows
        n_block_inner = left.n_block_cols
        n_block_cols = right.n_block_cols
        n_multiplies = n_block_rows * n_block_cols * n_block_inner

        # One row (and one partition) per block combination; ``idx`` is
        # decoded into the (r, c, k) coordinates of that combination.
        o = hl.utils.range_table(n_multiplies, n_partitions=n_multiplies)
        o = o.key_by(
            r=o.idx // (n_block_cols * n_block_inner),
            c=(o.idx % (n_block_cols * n_block_inner)) // n_block_inner,
            k=o.idx % n_block_inner)
        o = o._key_by_assert_sorted('r', 'c', 'k')
        # Re-key so that each operand can be looked up by its own block key.
        o = o._key_by_assert_sorted('r', 'k', 'c')
        o = o.annotate(left=left.m[o.r, o.k].block)
        o = o._key_by_assert_sorted('k', 'c', 'r')
        o = o.annotate(right=right.m[o.k, o.c].block)

        o = o.annotate(product=block_product(o.left, o.right))
        o = o._key_by_assert_sorted('r', 'c', 'k')
        o = o._key_by_assert_sorted('r', 'c')

        import hail.methods.misc as misc
        misc.require_key(o, 'collect_by_key')
        import hail.ir as ir

        # NOTE(review): the arguments of this aggregation and of
        # select_globals were missing from the truncated source; they are
        # reconstructed from the otherwise-unused ``block_aggregate``
        # parameter, the ``block`` field selected below and the unused
        # dimension locals. Confirm against upstream.
        o = Table(ir.TableAggregateByKey(
            o._tir,
            hl.struct(block=block_aggregate(o.product))._ir))
        o = o.select('block')
        o = o.select_globals(
            n_rows=n_rows,
            n_cols=n_cols,
            n_block_rows=n_block_rows,
            n_block_cols=n_block_cols,
            block_size=block_size)
        return DNDArray(o)
Exemple #3
    def entries(self):
        """Returns a table with the coordinates and numeric value of each block matrix entry.

        Examples
        --------
        >>> import numpy as np
        >>> block_matrix = BlockMatrix.from_numpy(np.matrix([[5, 7], [2, 8]]), 2)
        >>> entries_table = block_matrix.entries()
        >>> entries_table.show()
        +-------+-------+-------------+
        |     i |     j |       entry |
        +-------+-------+-------------+
        | int64 | int64 |     float64 |
        +-------+-------+-------------+
        |     0 |     0 | 5.00000e+00 |
        |     0 |     1 | 7.00000e+00 |
        |     1 |     0 | 2.00000e+00 |
        |     1 |     1 | 8.00000e+00 |
        +-------+-------+-------------+

        Warning
        -------
        The resulting table may be filtered, aggregated, and queried, but should only be
        directly exported to disk if the block matrix is very small.

        Returns
        -------
        :class:`.Table`
            Table with a row for each entry.
        """
        return Table(self._jbm.entriesTable(Env.hc()._jhc))
Exemple #4
    def fit_alternatives_numpy(self, pa, a=None, return_pandas=False):
        r"""Fit and test alternative model for each augmented design matrix.

        Notes
        -----
        This Python-only implementation runs serially on master. See
        the scalable implementation :meth:`fit_alternatives` for documentation
        of the returned table.

        Parameters
        ----------
        pa: :class:`ndarray`
            Projected matrix :math:`P_r A` of alternatives with shape :math:`(r, m)`.
            Each column is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a: :class:`ndarray`, optional
            Matrix :math:`A` of alternatives with shape :math:`(n, m)`.
            Each column is an augmentation :math:`x_\star` of :math:`X`.
            Required for low-rank inference.
        return_pandas: :obj:`bool`
            If true, return pandas dataframe. If false, return Hail table.

        Returns
        -------
        :class:`.Table` or :class:`.pandas.DataFrame`
            Table of results for each augmented design matrix.
        """
        self._check_dof(self.f + 1)

        if not self._fitted:
            raise Exception("null model is not fit. Run 'fit' first.")

        n_cols = pa.shape[1]
        assert pa.shape[0] == self.r

        # Low-rank inference needs the unprojected column as well; the
        # full-rank path passes None for it.
        if self.low_rank:
            assert a.shape[0] == self.n and a.shape[1] == n_cols
            data = [(i, ) + self._fit_alternative_numpy(pa[:, i], a[:, i])
                    for i in range(n_cols)]
        else:
            data = [(i, ) + self._fit_alternative_numpy(pa[:, i], None)
                    for i in range(n_cols)]

        df = pd.DataFrame.from_records(
            data, columns=['idx', 'beta', 'sigma_sq', 'chi_sq', 'p_value'])

        if return_pandas:
            return df
        else:
            return Table.from_pandas(df, key='idx')
    def fit_alternatives_numpy(self, pa, a=None, return_pandas=False):
        r"""Fit and test alternative model for each augmented design matrix.

        Notes
        -----
        This Python-only implementation runs serially on master. See
        the scalable implementation :meth:`fit_alternatives` for documentation
        of the returned table.

        Parameters
        ----------
        pa: :class:`ndarray`
            Projected matrix :math:`P_r A` of alternatives with shape :math:`(r, m)`.
            Each column is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a: :class:`ndarray`, optional
            Matrix :math:`A` of alternatives with shape :math:`(n, m)`.
            Each column is an augmentation :math:`x_\star` of :math:`X`.
            Required for low-rank inference.
        return_pandas: :obj:`bool`
            If true, return pandas dataframe. If false, return Hail table.

        Returns
        -------
        :class:`.Table` or :class:`.pandas.DataFrame`
            Table of results for each augmented design matrix.
        """
        self._check_dof(self.f + 1)

        if not self._fitted:
            raise Exception("null model is not fit. Run 'fit' first.")

        n_cols = pa.shape[1]
        assert pa.shape[0] == self.r

        # Low-rank inference needs the unprojected column as well; the
        # full-rank path passes None for it.
        if self.low_rank:
            assert a.shape[0] == self.n and a.shape[1] == n_cols
            data = [(i,) + self._fit_alternative_numpy(pa[:, i], a[:, i]) for i in range(n_cols)]
        else:
            data = [(i,) + self._fit_alternative_numpy(pa[:, i], None) for i in range(n_cols)]

        df = pd.DataFrame.from_records(data, columns=['idx', 'beta', 'sigma_sq', 'chi_sq', 'p_value'])

        if return_pandas:
            return df
        else:
            return Table.from_pandas(df, key='idx')
Exemple #6
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If the interval point type is incompatible with the dataset key, or
        if `intervals` contains a missing value.
    """

    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        # True when `partial` has the same leading field names as `full`,
        # with matching field types.
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        # A bare point equal to the first key field is wrapped in a
        # one-field struct so it is handled like a struct key prefix.
        needs_wrapper = True
        point_type = hl.tstruct(foo=point_type)
    elif isinstance(point_type, tstruct) and is_struct_prefix(
            point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            # NOTE(review): ``includes_end`` was on a line missing from the
            # truncated source; it is inferred from the preceding arguments.
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end), interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = hl.eval(intervals)
    intervals = hl.tarray(hl.tinterval(point_type))._convert_to_json(
        [wrap_input(i) for i in intervals])

    if isinstance(ds, MatrixTable):
        config = {
            'name': 'MatrixFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return MatrixTable(MatrixToMatrixApply(ds._mir, config))
    else:
        config = {
            'name': 'TableFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return Table(TableToTableApply(ds._tir, config))
Exemple #7
 def from_spark(self, df, key):
     """Build a table from a Spark DataFrame through the JVM ``Table.fromDF`` call.

     The DataFrame is handed over via its ``_jdf`` handle together with `key`.
     """
     java_table = Env.hail().table.Table.fromDF(Env.hc()._jhc, df._jdf, key)
     return Table._from_java(java_table)
# NOTE(review): the ``maf``, ``bounded`` and ``min`` parameters were on lines
# missing from the truncated source. They are restored from the docstring and
# the body; the ``bounded=True`` default in particular should be confirmed.
def identity_by_descent(dataset,
                        maf=None,
                        bounded=True,
                        min=None,
                        max=None) -> Table:
    """Compute matrix of identity-by-descent estimates.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    To calculate a full IBD matrix, using minor allele frequencies computed
    from the dataset itself:

    >>> hl.identity_by_descent(dataset)

    To calculate an IBD matrix containing only pairs of samples with
    ``PI_HAT`` in :math:`[0.2, 0.9]`, using minor allele frequencies stored in
    the row field `panel_maf`:

    >>> hl.identity_by_descent(dataset, maf=dataset['panel_maf'], min=0.2, max=0.9)

    Notes
    -----

    The dataset must have a column field named `s` which is a :class:`.StringExpression`
    and which uniquely identifies a column.

    The implementation is based on the IBD algorithm described in the `PLINK
    paper <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1950838>`__.

    :func:`.identity_by_descent` requires the dataset to be biallelic and does
    not perform LD pruning. Linkage disequilibrium may bias the result so
    consider filtering variants first.

    The resulting :class:`.Table` entries have the type: *{ i: String,
    j: String, ibd: { Z0: Double, Z1: Double, Z2: Double, PI_HAT: Double },
    ibs0: Long, ibs1: Long, ibs2: Long }*. The key list is: `*i: String, j:
    String*`.

    Conceptually, the output is a symmetric, sample-by-sample matrix. The
    output table has the following form

    .. code-block:: text

        i        j        ibd.Z0  ibd.Z1  ibd.Z2  ibd.PI_HAT ibs0  ibs1  ibs2
        sample1  sample2  1.0000  0.0000  0.0000  0.0000 ...
        sample1  sample3  1.0000  0.0000  0.0000  0.0000 ...
        sample1  sample4  0.6807  0.0000  0.3193  0.3193 ...
        sample1  sample5  0.1966  0.0000  0.8034  0.8034 ...

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Variant-keyed and sample-keyed :class:`.MatrixTable` containing genotype information.
    maf : :class:`.Float64Expression`, optional
        Row-indexed expression for the minor allele frequency.
    bounded : :obj:`bool`
        Forces the estimations for ``Z0``, ``Z1``, ``Z2``, and ``PI_HAT`` to take
        on biologically meaningful values (in the range [0,1]).
    min : :obj:`float` or :obj:`None`
        Sample pairs with a ``PI_HAT`` below this value will
        not be included in the output. Must be in :math:`[0,1]`.
    max : :obj:`float` or :obj:`None`
        Sample pairs with a ``PI_HAT`` above this value will
        not be included in the output. Must be in :math:`[0,1]`.

    Returns
    -------
    :class:`.Table`
    """

    require_col_key_str(dataset, 'identity_by_descent')

    # Keep only what the IBD computation reads: the optional ``__maf`` row
    # field and the ``GT`` entry field.
    if maf is not None:
        analyze('identity_by_descent/maf', maf, dataset._row_indices)
        dataset = dataset.select_rows(__maf=maf)
    else:
        dataset = dataset.select_rows()
    dataset = dataset.select_cols().select_globals().select_entries('GT')
    dataset = require_biallelic(dataset, 'ibd')

    # NOTE(review): the name of the IR node wrapping ``dataset._mir`` was on
    # a line missing from the truncated source; ``MatrixToTableApply`` is
    # inferred from the matrix-in/table-out shape of this call. Confirm.
    return Table(
        MatrixToTableApply(
            dataset._mir, {
                'name': 'IBD',
                'mafFieldName': '__maf' if maf is not None else None,
                'bounded': bounded,
                'min': min,
                'max': max,
            }))
Exemple #9
 def unpersist_table(self, t):
     """Call ``pyUnpersist`` on the Java IR of `t` and wrap the result as a table."""
     java_ir = self._to_java_ir(t._tir)
     return Table._from_java(java_ir.pyUnpersist())
Exemple #10
 def from_pandas(self, df, key):
     """Build a table from a pandas DataFrame by first converting it to a Spark DataFrame."""
     spark_df = Env.spark_session().createDataFrame(df)
     return Table.from_spark(spark_df, key)
Exemple #11
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If the interval point type is incompatible with the dataset key, or
        if `intervals` contains a missing value.
    """

    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        # True when `partial` has the same leading field names as `full`,
        # with matching field types.
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        # A bare point equal to the first key field must be wrapped in a
        # one-field struct before being handed to the JVM.
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError("The point type is incompatible with key type of the dataset ('{}', '{}')".format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            # NOTE(review): every argument after the first was on lines
            # missing from the truncated source; they are reconstructed as
            # the remaining Interval fields (end, includes_start,
            # includes_end). Confirm against upstream.
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in hl.eval(intervals)]
    if isinstance(ds, MatrixTable):
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(ds._jmt, intervals, keep)
        return MatrixTable._from_java(jmt)
    else:
        jt = Env.hail().methods.TableFilterIntervals.apply(ds._jt, intervals, keep)
        return Table._from_java(jt)
Exemple #12
    def fit_alternatives(self, pa_t_path, a_t_path=None, partition_size=None):
        r"""Fit and test alternative model for each augmented design matrix in parallel.

        Notes
        -----
        The alternative model is fit using REML constrained to the value of
        :math:`\gamma` set by :meth:`fit`.

        The likelihood ratio test of fixed effect parameter :math:`\beta_\star`
        uses (non-restricted) maximum likelihood:

        .. math::

          \chi^2 = 2 \log\left(\frac{
          \max_{\beta_\star, \beta, \sigma^2}\mathrm{N}
          (y \, | \, x_\star \beta_\star + X \beta; \sigma^2(K + \gamma^{-1}I)}
          {\max_{\beta, \sigma^2} \mathrm{N}
          (y \, | \, x_\star \cdot 0 + X \beta; \sigma^2(K + \gamma^{-1}I)}
          \right)

        The p-value is given by the tail probability under a chi-squared
        distribution with one degree of freedom.

        The resulting table has the following fields:

        .. list-table::
          :header-rows: 1

          * - Field
            - Type
            - Value
          * - `idx`
            - int64
            - Index of augmented design matrix.
          * - `beta`
            - float64
            - :math:`\beta_\star`
          * - `sigma_sq`
            - float64
            - :math:`\sigma^2`
          * - `chi_sq`
            - float64
            - :math:`\chi^2`
          * - `p_value`
            - float64
            - p-value

        :math:`(P_r A)^T` and :math:`A^T` (if given) must have the same number
        of rows (augmentations). These rows are grouped into partitions for
        parallel processing. The number of partitions equals the ceiling of
        ``n_rows / partition_size``, and should be at least the number of cores
        to make use of all cores. By default, there is one partition per row of
        blocks in :math:`(P_r A)^T`. Setting the partition size to an exact
        (rather than approximate) divisor or multiple of the block size reduces
        superfluous shuffling of data.

        The number of columns in each block matrix must be less than :math:`2^{31}`.

        Warning
        -------
        The block matrices must be stored in row-major format, as results
        from :meth:`.BlockMatrix.write` with ``force_row_major=True`` and from
        :meth:`.BlockMatrix.write_from_entry_expr`. Otherwise, this method
        will produce an error message.

        Parameters
        ----------
        pa_t_path: :obj:`str`
            Path to block matrix :math:`(P_r A)^T` with shape :math:`(m, r)`.
            Each row is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a_t_path: :obj:`str`, optional
            Path to block matrix :math:`A^T` with shape :math:`(m, n)`.
            Each row is an augmentation :math:`x_\star` of :math:`X`.
            Include for low-rank inference.
        partition_size: :obj:`int`, optional
            Number of rows to process per partition.
            Default given by block size of :math:`(P_r A)^T`.

        Returns
        -------
        :class:`.Table`
            Table of results for each augmented design matrix.

        Raises
        ------
        ValueError
            If `a_t_path` is missing for a low-rank model, is given for a
            full-rank model, or if `partition_size` is not positive.
        """
        from hail.table import Table

        self._check_dof(self.f + 1)

        if self.low_rank and a_t_path is None:
            raise ValueError('model is low-rank so a_t is required.')
        elif not (self.low_rank or a_t_path is None):
            raise ValueError('model is full-rank so a_t must not be set.')

        if self._scala_model is None:
            # NOTE(review): the body of this branch was missing from the
            # truncated source; `_set_scala_model` is assumed to be the
            # sibling method that populates `self._scala_model`. Confirm.
            self._set_scala_model()

        if partition_size is None:
            # Default to one partition per row of blocks.
            block_size = Env.hail().linalg.BlockMatrix.readMetadata(Env.hc()._jhc, pa_t_path).blockSize()
            partition_size = block_size
        elif partition_size <= 0:
            raise ValueError(f'partition_size must be positive, found {partition_size}')

        jpa_t = Env.hail().linalg.RowMatrix.readBlockMatrix(Env.hc()._jhc, pa_t_path, jsome(partition_size))

        if a_t_path is None:
            maybe_ja_t = jnone()
        else:
            maybe_ja_t = jsome(
                Env.hail().linalg.RowMatrix.readBlockMatrix(Env.hc()._jhc, a_t_path, jsome(partition_size)))

        return Table(self._scala_model.fit(jpa_t, maybe_ja_t))
Exemple #13
def nirvana(dataset: Union[MatrixTable, Table], config, block_size=500000, name='nirvana'):
    """Annotate variants using `Nirvana <https://github.com/Illumina/Nirvana>`_.

    .. include:: ../_templates/experimental.rst

    .. include:: ../_templates/req_tvariant.rst

    :func:`.nirvana` runs `Nirvana
    <https://github.com/Illumina/Nirvana>`_ on the current dataset and adds a
    new row field in the location specified by `name`.

    Examples
    --------

    Add Nirvana annotations to the dataset:

    >>> result = hl.nirvana(dataset, "data/nirvana.properties") # doctest: +SKIP

    **Configuration**

    :func:`.nirvana` requires a configuration file. The format is a
    `.properties file <https://en.wikipedia.org/wiki/.properties>`__, where each
    line defines a property as a key-value pair of the form ``key = value``.
    :func:`.nirvana` supports the following properties:

    - **hail.nirvana.dotnet** -- Location of dotnet. Optional, default: dotnet.
    - **hail.nirvana.path** -- Value of the PATH environment variable when
      invoking Nirvana. Optional, by default PATH is not set.
    - **hail.nirvana.location** -- Location of Nirvana.dll. Required.
    - **hail.nirvana.reference** -- Location of reference genome. Required.
    - **hail.nirvana.cache** -- Location of cache. Required.
    - **hail.nirvana.supplementaryAnnotationDirectory** -- Location of
      Supplementary Database. Optional, no supplementary database by default.

    Here is an example ``nirvana.properties`` configuration file:

    .. code-block:: text

        hail.nirvana.location = /path/to/dotnet/netcoreapp2.0/Nirvana.dll
        hail.nirvana.reference = /path/to/nirvana/References/Homo_sapiens.GRCh37.Nirvana.dat
        hail.nirvana.cache = /path/to/nirvana/Cache/GRCh37/Ensembl
        hail.nirvana.supplementaryAnnotationDirectory = /path/to/nirvana/SupplementaryDatabase/GRCh37

    **Annotations**

    A new row field is added in the location specified by `name` with the
    following schema:

    .. code-block:: text

        struct {
            chromosome: str,
            refAllele: str,
            position: int32,
            altAlleles: array<str>,
            cytogeneticBand: str,
            quality: float64,
            filters: array<str>,
            jointSomaticNormalQuality: int32,
            copyNumber: int32,
            strandBias: float64,
            recalibratedQuality: float64,
            variants: array<struct {
                altAllele: str,
                refAllele: str,
                chromosome: str,
                begin: int32,
                end: int32,
                phylopScore: float64,
                isReferenceMinor: bool,
                variantType: str,
                vid: str,
                hgvsg: str,
                isRecomposedVariant: bool,
                isDecomposedVariant: bool,
                regulatoryRegions: array<struct {
                    id: str,
                    type: str,
                    consequence: set<str>
                }>,
                clinvar: array<struct {
                    id: str,
                    reviewStatus: str,
                    isAlleleSpecific: bool,
                    alleleOrigins: array<str>,
                    refAllele: str,
                    altAllele: str,
                    phenotypes: array<str>,
                    medGenIds: array<str>,
                    omimIds: array<str>,
                    orphanetIds: array<str>,
                    significance: str,
                    lastUpdatedDate: str,
                    pubMedIds: array<str>
                }>,
                cosmic: array<struct {
                    id: str,
                    isAlleleSpecific: bool,
                    refAllele: str,
                    altAllele: str,
                    gene: str,
                    sampleCount: int32,
                    studies: array<struct {
                        id: int32,
                        histology: str,
                        primarySite: str
                    }>
                }>,
                dbsnp: struct {
                    ids: array<str>
                },
                globalAllele: struct {
                    globalMinorAllele: str,
                    globalMinorAlleleFrequency: float64
                },
                gnomad: struct {
                    coverage: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    allHc: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    afrHc: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    amrHc: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    easHc: int32,
                    finAf: float64,
                    finAc: int32,
                    finAn: int32,
                    finHc: int32,
                    nfeAf: float64,
                    nfeAc: int32,
                    nfeAn: int32,
                    nfeHc: int32,
                    othAf: float64,
                    othAc: int32,
                    othAn: int32,
                    othHc: int32,
                    asjAf: float64,
                    asjAc: int32,
                    asjAn: int32,
                    asjHc: int32,
                    failedFilter: bool
                },
                gnomadExome: struct {
                    coverage: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    allHc: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    afrHc: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    amrHc: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    easHc: int32,
                    finAf: float64,
                    finAc: int32,
                    finAn: int32,
                    finHc: int32,
                    nfeAf: float64,
                    nfeAc: int32,
                    nfeAn: int32,
                    nfeHc: int32,
                    othAf: float64,
                    othAc: int32,
                    othAn: int32,
                    othHc: int32,
                    asjAf: float64,
                    asjAc: int32,
                    asjAn: int32,
                    asjHc: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32,
                    sasHc: int32,
                    failedFilter: bool
                },
                topmed: struct {
                    failedFilter: bool,
                    allAc: int32,
                    allAn: int32,
                    allAf: float64,
                    allHc: int32
                },
                oneKg: struct {
                    ancestralAllele: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    eurAf: float64,
                    eurAc: int32,
                    eurAn: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32
                },
                mitomap: array<struct {
                    refAllele: str,
                    altAllele: str,
                    diseases : array<str>,
                    hasHomoplasmy: bool,
                    hasHeteroplasmy: bool,
                    status: str,
                    clinicalSignificance: str,
                    scorePercentile: float64,
                    isAlleleSpecific: bool,
                    chromosome: str,
                    begin: int32,
                    end: int32,
                    variantType: str
                }>,
                transcripts: struct {
                    refSeq: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cdnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>,
                    ensembl: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cdnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>
                },
                overlappingGenes: array<str>
            }>,
            genes: array<struct {
                name: str,
                omim: array<struct {
                    mimNumber: int32,
                    hgnc: str,
                    description: str,
                    phenotypes: array<struct {
                        mimNumber: int32,
                        phenotype: str,
                        mapping: str,
                        inheritance: array<str>,
                        comments: str
                    }>
                }>,
                exac: struct {
                    pLi: float64,
                    pRec: float64,
                    pNull: float64
                }
            }>
        }

    Parameters
    ----------
    dataset : :class:`.MatrixTable` or :class:`.Table`
        Dataset.
    config : :obj:`str`
        Path to Nirvana configuration file.
    block_size : :obj:`int`
        Number of rows to process per Nirvana invocation.
    name : :obj:`str`
        Name for resulting row field.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
        Dataset with new row-indexed field `name` containing Nirvana annotations.
    """
    # Reduce the input to a key-only table; annotations are joined back by key.
    if isinstance(dataset, MatrixTable):
        require_row_key_variant(dataset, 'nirvana')
        ht = dataset.select_rows().rows()
    else:
        require_table_key_variant(dataset, 'nirvana')
        ht = dataset.select()

    # NOTE(review): the closing line of this call was missing from the
    # truncated source; only the parentheses needed to balance it are
    # restored. Confirm whether upstream chains anything further here.
    annotations = Table(TableToTableApply(ht._tir,
                                          {'name': 'Nirvana',
                                           'config': config,
                                           'blockSize': block_size}
                                          ))

    if isinstance(dataset, MatrixTable):
        return dataset.annotate_rows(**{name: annotations[dataset.row_key].nirvana})
    else:
        return dataset.annotate(**{name: annotations[dataset.key].nirvana})
Exemple #14
 def from_spark(self, df, key):
     """Convert ``df`` through the backend's ``pyFromDF`` (passing ``key``) and wrap the result as a Table."""
     java_table = self._jbackend.pyFromDF(df._jdf, key)
     return Table._from_java(java_table)
Exemple #15
    def fit_alternatives(self, pa_t_path, a_t_path=None, partition_size=None):
        r"""Fit and test alternative model for each augmented design matrix in parallel.

        Notes
        -----
        The alternative model is fit using REML constrained to the value of
        :math:`\gamma` set by :meth:`fit`.

        The likelihood ratio test of fixed effect parameter :math:`\beta_\star`
        uses (non-restricted) maximum likelihood:

        .. math::

          \chi^2 = 2 \log\left(\frac{
          \max_{\beta_\star, \beta, \sigma^2}\mathrm{N}
          (y \, | \, x_\star \beta_\star + X \beta; \sigma^2(K + \gamma^{-1}I))}
          {\max_{\beta, \sigma^2} \mathrm{N}
          (y \, | \, x_\star \cdot 0 + X \beta; \sigma^2(K + \gamma^{-1}I))}
          \right)

        The p-value is given by the tail probability under a chi-squared
        distribution with one degree of freedom.

        The resulting table has the following fields:

        .. list-table::
          :header-rows: 1

          * - Field
            - Type
            - Value
          * - `idx`
            - int64
            - Index of augmented design matrix.
          * - `beta`
            - float64
            - :math:`\beta_\star`
          * - `sigma_sq`
            - float64
            - :math:`\sigma^2`
          * - `chi_sq`
            - float64
            - :math:`\chi^2`
          * - `p_value`
            - float64
            - p-value

        :math:`(P_r A)^T` and :math:`A^T` (if given) must have the same number
        of rows (augmentations). These rows are grouped into partitions for
        parallel processing. The number of partitions equals the ceiling of
        ``n_rows / partition_size``, and should be at least the number of cores
        to make use of all cores. By default, there is one partition per row of
        blocks in :math:`(P_r A)^T`. Setting the partition size to an exact
        (rather than approximate) divisor or multiple of the block size reduces
        superfluous shuffling of data.

        The number of columns in each block matrix must be less than :math:`2^{31}`.

        Warning
        -------
        The block matrices must be stored in row-major format, as results
        from :meth:`.BlockMatrix.write` with ``force_row_major=True`` and from
        :meth:`.BlockMatrix.write_from_entry_expr`. Otherwise, this method
        will produce an error message.

        Parameters
        ----------
        pa_t_path: :obj:`str`
            Path to block matrix :math:`(P_r A)^T` with shape :math:`(m, r)`.
            Each row is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a_t_path: :obj:`str`, optional
            Path to block matrix :math:`A^T` with shape :math:`(m, n)`.
            Each row is an augmentation :math:`x_\star` of :math:`X`.
            Include for low-rank inference.
        partition_size: :obj:`int`, optional
            Number of rows to process per partition.
            Default given by block size of :math:`(P_r A)^T`.

        Returns
        -------
        :class:`.Table`
            Table of results for each augmented design matrix.

        Raises
        ------
        ValueError
            If `a_t_path` is missing for a low-rank model, given for a
            full-rank model, or if `partition_size` is not positive.
        """
        from hail.table import Table

        self._check_dof(self.f + 1)

        # a_t is required exactly when the model is low-rank.
        if self.low_rank and a_t_path is None:
            raise ValueError('model is low-rank so a_t is required.')
        elif not (self.low_rank or a_t_path is None):
            raise ValueError('model is full-rank so a_t must not be set.')

        if self._scala_model is None:
            # NOTE(review): the body of this branch was missing from the
            # snippet; a lazy initialiser for `_scala_model` is assumed here.
            # Confirm the helper's name against the enclosing class.
            self._set_scala_model()

        if partition_size is None:
            # Default to one partition per row of blocks of (P_r A)^T.
            block_size = Env.hail().linalg.BlockMatrix.readMetadata(Env.hc()._jhc, pa_t_path).blockSize()
            partition_size = block_size
        elif partition_size <= 0:
            raise ValueError(f'partition_size must be positive, found {partition_size}')

        jpa_t = Env.hail().linalg.RowMatrix.readBlockMatrix(Env.hc()._jhc, pa_t_path, jsome(partition_size))

        if a_t_path is None:
            maybe_ja_t = jnone()
        else:
            maybe_ja_t = jsome(
                Env.hail().linalg.RowMatrix.readBlockMatrix(Env.hc()._jhc, a_t_path, jsome(partition_size)))

        return Table._from_java(self._scala_model.fit(jpa_t, maybe_ja_t))
Exemple #16
def concordance(left, right) -> Tuple[List[List[int]], Table, Table]:
    """Calculate call concordance with another dataset.

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    .. include:: ../_templates/req_unphased_diploid_gt.rst

    Examples
    --------

    Compute concordance between two datasets and output the global concordance
    statistics and two tables with concordance computed per column key and per
    row key:

    >>> global_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset2)

    Notes
    -----

    This method computes the genotype call concordance (from the entry
    field **GT**) between two biallelic variant datasets.  It requires
    unique sample IDs and performs an inner join on samples (only
    samples in both datasets will be considered). In addition, all genotype
    calls must be **diploid** and **unphased**.

    It performs an ordered zip join of the variants.  That means the
    variants of each dataset are sorted, with duplicate variants
    appearing in some random relative order, and then zipped together.
    When a variant appears a different number of times between the two
    datasets, the dataset with the fewer number of instances is padded
    with "no data".  For example, if a variant is only in one dataset,
    then each genotype is treated as "no data" in the other.

    This method returns a tuple of three objects: a nested list of
    list of int with global concordance summary statistics, a table
    with concordance statistics per column key, and a table with
    concordance statistics per row key.

    **Using the global summary result**

    The global summary is a list of list of int (conceptually a 5 by 5 matrix),
    where the indices have special meaning:

    0. No Data (missing variant)
    1. No Call (missing genotype call)
    2. Hom Ref
    3. Heterozygous
    4. Hom Var

    The first index is the state in the left dataset and the second index is
    the state in the right dataset. Typical uses of the summary list are shown
    below.

    >>> summary, samples, variants = hl.concordance(dataset, dataset2)
    >>> left_homref_right_homvar = summary[2][4]
    >>> left_het_right_missing = summary[3][1]
    >>> left_het_right_something_else = sum(summary[3][:]) - summary[3][3]
    >>> total_concordant = summary[2][2] + summary[3][3] + summary[4][4]
    >>> total_discordant = sum([sum(s[2:]) for s in summary[2:]]) - total_concordant

    **Using the table results**

    Table 1: Concordance statistics by column

    This table contains the column key field of `left`, and the following fields:

        - `n_discordant` (:py:data:`.tint64`) -- Count of discordant calls (see below for
          full definition).
        - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) --
          Array of concordance per state on left and right, matching the structure of
          the global summary defined above.

    Table 2: Concordance statistics by row

    This table contains the row key fields of `left`, and the following fields:

        - `n_discordant` (:py:data:`.tfloat64`) -- Count of discordant calls (see below for
          full definition).
        - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) --
          Array of concordance per state on left and right, matching the structure of the
          global summary defined above.

    In these tables, the column **n_discordant** is provided as a convenience,
    because this is often one of the most useful concordance statistics. This
    value is the number of genotypes which were called (homozygous reference,
    heterozygous, or homozygous variant) in both datasets, but where the call
    did not match between the two.

    The column `concordance` matches the structure of the global summary,
    which is detailed above. Once again, the first index into this array is the
    state on the left, and the second index is the state on the right. For
    example, ``concordance[1][4]`` is the number of "no call" genotypes on the
    left that were called homozygous variant on the right.

    Parameters
    ----------
    left : :class:`.MatrixTable`
        First dataset to compare.
    right : :class:`.MatrixTable`
        Second dataset to compare.

    Returns
    -------
    (list of list of int, :class:`.Table`, :class:`.Table`)
        The global concordance statistics, a table with concordance statistics
        per column key, and a table with concordance statistics per row key.
    """

    # Both inputs must have a string column key.
    require_col_key_str(left, 'concordance, left')
    require_col_key_str(right, 'concordance, right')
    # Strip everything except the GT entry field before comparing.
    left = left.select_rows().select_cols().select_globals().select_entries('GT')
    right = right.select_rows().select_cols().select_globals().select_entries('GT')
    left = require_biallelic(left, "concordance, left")
    right = require_biallelic(right, "concordance, right")

    # NOTE(review): the argument list and closing parenthesis of this call are
    # missing from the snippet; restore them from the upstream source before use.
    r = Env.hail().methods.CalculateConcordance.pyApply(
    j_global_conc = r._1()
    col_conc = Table._from_java(r._2())
    row_conc = Table._from_java(r._3())
    # Copy the Java-side summary into a 5 x 5 list of lists, indexed [j][i].
    global_conc = [[j_global_conc.apply(j).apply(i) for i in range(5)] for j in range(5)]

    return global_conc, col_conc, row_conc
Exemple #17
 def _filtered_entries_table(self, table, radius, include_diagonal):
     """Call ``filteredEntriesTable`` on the underlying Java block matrix and wrap the result in a Table."""
     jt = self._jbm.filteredEntriesTable(table._jt, radius, include_diagonal)
     return Table(jt)
Exemple #18
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the partition key (when filtering a matrix
        table) or the key (when filtering a table), or equal to the
        first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If `ds` is an unkeyed table, if the interval point type is not
        compatible with the key type, or if `intervals` contains a missing
        value.
    """

    if isinstance(ds, MatrixTable):
        n_pk = len(ds.partition_key)
        pk_type = ds.partition_key.dtype
    else:
        assert isinstance(ds, Table)
        if ds.key is None:
            raise TypeError("cannot filter intervals of an unkeyed Table")
        n_pk = len(ds.key)
        pk_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        # `partial` is a prefix of `full` when its field names are the leading
        # field names of `full` and every shared field has the same type.
        if list(partial) != list(full)[:len(partial)]:
            return False
        return all(full[k] == v for k, v in partial.items())

    if point_type == pk_type[0]:
        # A bare point of the first key field must be wrapped in a struct.
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(
            point_type, pk_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(pk_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end), interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in intervals.value]
    if isinstance(ds, MatrixTable):
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(
            ds._jvds, intervals, keep)
        return MatrixTable(jmt)
    else:
        jt = Env.hail().methods.TableFilterIntervals.apply(
            ds._jt, intervals, keep)
        return Table(jt)
Exemple #19
 def variable_importance(self):
     """Wrap the result of the Java model's ``variableImportance()`` as a Table."""
     j_importance = self._jrf_model.variableImportance()
     return Table._from_java(j_importance)
Exemple #20
def pca(entry_expr,
        compute_loadings=False) -> Tuple[List[float], Table, Table]:
    r"""Run principal component analysis (PCA) on numeric columns derived from a
    matrix table.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl.pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                 k=2)

    Warning
    -------
      This method does **not** automatically mean-center or normalize each column.
      If desired, such transformations should be incorporated in `entry_expr`.

      Hail will return an error if `entry_expr` evaluates to missing, nan, or
      infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the rows
    of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2,
    \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`.
    Typically one computes only the first :math:`k` singular vectors and values,
    yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of
    :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are
    :math:`n \times k`, :math:`k \times k` and :math:`m \times k`
    respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the Gramian
    :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2,
    \ldots`, which represent the variances carried by the respective PCs.
    Hail only returns the loadings if the ``compute_loadings`` parameter is
    ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing
    the principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """
    # NOTE(review): the docstring and the body both use `k`, but the signature
    # above does not declare it; the parameter line appears to have been lost
    # from this snippet.
    check_entry_indexed('pca/entry_expr', entry_expr)

    mt = matrix_table_source('pca/entry_expr', entry_expr)

    #  FIXME: remove once select_entries on a field is free
    if entry_expr in mt._fields_inverse:
        field = mt._fields_inverse[entry_expr]
        # NOTE(review): an `else:` appears to be missing here; as written the
        # two lines below always run and overwrite `field`.
        field = Env.get_uid()
        mt = mt.select_entries(**{field: entry_expr})
    # Drop all column, row and global fields; only the entry field is needed.
    mt = mt.select_cols().select_rows().select_globals()

    # NOTE(review): this expression is truncated in the snippet (the call
    # wrapping `mt._mir, {...}` and all closing brackets are missing).
    t = (Table(
            mt._mir, {
                'name': 'PCA',
                'entryField': field,
                'k': k,
                'computeLoadings': compute_loadings

    g = t.index_globals()
    # Scores are read from the result's globals and keyed by the column key.
    scores = hl.Table.parallelize(g.scores, key=list(mt.col_key))
    if not compute_loadings:
        t = None
    return hl.eval(g.eigenvalues), scores, None if t is None else t.drop(
        'eigenvalues', 'scores')
Exemple #21
 def unpersist_table(self, ht):
     """Call ``unpersist()`` on the table's Java object and wrap the result as a Table."""
     unpersisted = ht._jt.unpersist()
     return Table._from_java(unpersisted)
Exemple #22
 def from_spark(self, df, key):
     """Convert ``df`` with ``Env.jutils().pyFromDF`` (passing ``key``) and wrap the result as a Table."""
     java_table = Env.jutils().pyFromDF(df._jdf, key)
     return Table._from_java(java_table)
Exemple #23
def maximal_independent_set(i, j, keep=True, tie_breaker=None) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...        s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.
    """

    if i.dtype != j.dtype:
        raise ValueError("'maximal_independent_set' expects arguments `i` and `j` to have same type. "
                         "Found {} and {}.".format(i.dtype, j.dtype))

    # Both expressions must come from the same Table.
    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError("'maximal_independent_set' expects an expression of 'Table'. Found {}".format(
            "expression of '{}'".format(
                source.__class__) if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        # The user function is applied to element 0 of two 1-tuple variables,
        # and the resulting expression's IR is passed on as a string.
        wrapped_node_t = ttuple(node_t)
        l = construct_variable('l', wrapped_node_t)
        r = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
        # NOTE(review): an `else:` appears to be missing here; as written the
        # two lines below always run and discard the tie breaker.
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

    # NOTE(review): this expression is truncated in the snippet (unclosed
    # parenthesis; the rest of the method chain is missing).
    nodes = (t.select(node=[i, j])

    edges = t.key_by().select('i', 'j')
    nodes_in_set = Env.hail().utils.Graph.maximalIndependentSet(edges._jt.collect(), node_t._jtype, joption(tie_breaker_str))

    # Attach the computed set as a global field so rows can be filtered on it.
    nt = Table._from_java(nodes._jt.annotateGlobal(nodes_in_set, hl.tset(node_t)._jtype, 'nodes_in_set'))
    # NOTE(review): this expression is also truncated (unclosed parenthesis).
    nt = (nt
          .filter(nt.nodes_in_set.contains(nt.node), keep)

    return nt
Exemple #24
def vep(dataset: Union[Table, MatrixTable], config, block_size=1000, name='vep', csq=False):
    """Annotate variants with VEP.

    .. include:: ../_templates/req_tvariant.rst

    :func:`.vep` runs `Variant Effect Predictor
    <http://www.ensembl.org/info/docs/tools/vep/index.html>`__ on the
    current dataset and adds the result as a row field.

    Examples
    --------

    Add VEP annotations to the dataset:

    >>> result = hl.vep(dataset, "data/vep-configuration.json") # doctest: +SKIP

    Notes
    -----

    **Configuration**

    :func:`.vep` needs a configuration file to tell it how to run VEP.
    The format of the configuration file is JSON, and :func:`.vep`
    expects a JSON object with three fields:

    - `command` (array of string) -- The VEP command line to run.  The string literal `__OUTPUT_FORMAT_FLAG__` is replaced with `--json` or `--vcf` depending on `csq`.
    - `env` (object) -- A map of environment variables to values to add to the environment when invoking the command.  The value of each object member must be a string.
    - `vep_json_schema` (string): The type of the VEP JSON schema (as produced by the VEP when invoked with the `--json` option).  Note: This is the old-style 'parseable' Hail type syntax.  This will change.

    Here is an example configuration file for invoking VEP release 85
    installed in `/vep` with the Loftee plugin (the `command` array shown
    here is abridged; some entries are omitted):

    .. code-block:: text

        {
            "command": [
                "--format", "vcf",
                "--cache", "--offline",
                "--assembly", "GRCh37",
                "--plugin", "LoF,human_ancestor_fa:/root/.vep/loftee_data/human_ancestor.fa.gz,filter_position:0.05,min_intron_size:15,conservation_file:/root/.vep/loftee_data/phylocsf_gerp.sql,gerp_file:/root/.vep/loftee_data/GERP_scores.final.sorted.txt.gz",
                "-o", "STDOUT"
            ],
            "env": {
                "PERL5LIB": "/vep_data/loftee"
            },
            "vep_json_schema": "Struct{assembly_name:String,allele_string:String,ancestral:String,colocated_variants:Array[Struct{aa_allele:String,aa_maf:Float64,afr_allele:String,afr_maf:Float64,allele_string:String,amr_allele:String,amr_maf:Float64,clin_sig:Array[String],end:Int32,eas_allele:String,eas_maf:Float64,ea_allele:String,ea_maf:Float64,eur_allele:String,eur_maf:Float64,exac_adj_allele:String,exac_adj_maf:Float64,exac_allele:String,exac_afr_allele:String,exac_afr_maf:Float64,exac_amr_allele:String,exac_amr_maf:Float64,exac_eas_allele:String,exac_eas_maf:Float64,exac_fin_allele:String,exac_fin_maf:Float64,exac_maf:Float64,exac_nfe_allele:String,exac_nfe_maf:Float64,exac_oth_allele:String,exac_oth_maf:Float64,exac_sas_allele:String,exac_sas_maf:Float64,id:String,minor_allele:String,minor_allele_freq:Float64,phenotype_or_disease:Int32,pubmed:Array[Int32],sas_allele:String,sas_maf:Float64,somatic:Int32,start:Int32,strand:Int32}],context:String,end:Int32,id:String,input:String,intergenic_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],impact:String,minimised:Int32,variant_allele:String}],most_severe_consequence:String,motif_feature_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],high_inf_pos:String,impact:String,minimised:Int32,motif_feature_id:String,motif_name:String,motif_pos:Int32,motif_score_change:Float64,strand:Int32,variant_allele:String}],regulatory_feature_consequences:Array[Struct{allele_num:Int32,biotype:String,consequence_terms:Array[String],impact:String,minimised:Int32,regulatory_feature_id:String,variant_allele:String}],seq_region_name:String,start:Int32,strand:Int32,transcript_consequences:Array[Struct{allele_num:Int32,amino_acids:String,biotype:String,canonical:Int32,ccds:String,cdna_start:Int32,cdna_end:Int32,cds_end:Int32,cds_start:Int32,codons:String,consequence_terms:Array[String],distance:Int32,domains:Array[Struct{db:String,name:String}],exon:String,gene_id:String,gene_pheno:Int32,gene_symbol:String,gene_symbol_source:String,hgnc_id:String,hgvsc:String,hgvsp:String,hgvs_offset:Int32,impact:String,intron:String,lof:String,lof_flags:String,lof_filter:String,lof_info:String,minimised:Int32,polyphen_prediction:String,polyphen_score:Float64,protein_end:Int32,protein_start:Int32,protein_id:String,sift_prediction:String,sift_score:Float64,strand:Int32,swissprot:String,transcript_id:String,trembl:String,uniparc:String,variant_allele:String}],variant_class:String}"
        }

    **Annotations**

    A new row field is added in the location specified by `name` with type
    given by the `vep_json_schema` (if `csq` is ``False``) or
    :py:data:`.tstr` (if `csq` is ``True``).

    If csq is ``True``, then the CSQ header string is also added as a global
    field with name ``name + '_csq_header'``.

    Parameters
    ----------
    dataset : :class:`.MatrixTable` or :class:`.Table`
        Dataset.
    config : :obj:`str`
        Path to VEP configuration file.
    block_size : :obj:`int`
        Number of rows to process per VEP invocation.
    name : :obj:`str`
        Name for resulting row field.
    csq : :obj:`bool`
        If ``True``, annotates with the VCF CSQ field as a :py:data:`.tstr`.
        If ``False``, annotates as the `vep_json_schema`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
        Dataset with new row-indexed field `name` containing VEP annotations.
    """

    # Reduce the input to a key-only table before handing it to VEP.
    if isinstance(dataset, MatrixTable):
        require_row_key_variant(dataset, 'vep')
        ht = dataset.select_rows().rows()
    else:
        require_table_key_variant(dataset, 'vep')
        ht = dataset.select()

    # Persisted because the result is read for the globals and for the join.
    annotations = Table(TableToTableApply(ht._tir,
                                          {'name': 'VEP',
                                           'config': config,
                                           'csq': csq,
                                           'blockSize': block_size})).persist()

    if csq:
        dataset = dataset.annotate_globals(
            **{name + '_csq_header': annotations.index_globals()['vep_csq_header']})

    # Join the annotations back onto the original dataset by key.
    if isinstance(dataset, MatrixTable):
        return dataset.annotate_rows(**{name: annotations[dataset.row_key].vep})
    else:
        return dataset.annotate(**{name: annotations[dataset.key].vep})
Exemple #25
 def from_spark(self, df, key):
     """Convert ``df`` with the JVM ``Table.fromDF`` (passing the Hail context and ``key``) and wrap the result."""
     java_table = Env.hail().table.Table.fromDF(Env.hc()._jhc, df._jdf, key)
     return Table._from_java(java_table)
Exemple #26
 def persist_table(self, t, storage_level):
     # NOTE(review): this snippet is truncated; the argument to
     # `Table._from_java(` and its closing parenthesis are missing.
     return Table._from_java(
Exemple #27
 def persist_table(self, t, storage_level):
     """Convert the table's IR to Java, call ``pyPersist(storage_level)`` on it, and wrap the result as a Table."""
     persisted = self._to_java_ir(t._tir).pyPersist(storage_level)
     return Table._from_java(persisted)
Exemple #28
 def persist_table(self, ht, storage_level):
     """Call ``persist(storage_level)`` on the table's Java object and wrap the result as a Table."""
     persisted = ht._jt.persist(storage_level)
     return Table._from_java(persisted)
Exemple #29
 def unpersist_table(self, t):
     """Convert the table's IR to Java, call ``pyUnpersist()`` on it, and wrap the result as a Table."""
     unpersisted = self._to_java_ir(t._tir).pyUnpersist()
     return Table._from_java(unpersisted)
Exemple #30
def maximal_independent_set(i, j, keep=True, tie_breaker=None) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.


    Prune individuals from a dataset until no close relationships remain with
    respect to a PC-Relate measure of kinship.

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
    >>> pairs = pairs.key_by(i=pairs.i.s, j=pairs.j.s).select()
    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(hl.is_defined(related_samples_to_remove[dataset.s]), keep=False)

    Prune individuals from a dataset, preferring to keep cases over controls.

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
    >>> pairs = pairs.key_by(i=pairs.i.s, j=pairs.j.s).select()
    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.select(
    ...        s = related_samples_to_remove.node.id).key_by('s')[dataset.s]), keep=False)


    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.

        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.

    if i.dtype != j.dtype:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to have same type. "
            "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError(
            "'maximal_independent_set' expects an expression of 'Table'. Found {}"
            .format("expression of '{}'".format(source.__class__)
                    if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_expr(VariableReference('l'), wrapped_node_t)
        r = construct_expr(VariableReference('r'), wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_hql = tie_breaker_expr._ast.to_hql()
        t, _ = source._process_joins(i, j)
        tie_breaker_hql = None

    nodes = (t.select(node=[i, j]).explode('node').key_by('node').select())

    edges = t.key_by(None).select('i', 'j')
    nodes_in_set = Env.hail().utils.Graph.maximalIndependentSet(
        edges._jt.collect(), node_t._jtype, joption(tie_breaker_hql))

    nt = Table(
                                 hl.tset(node_t)._jtype, 'nodes_in_set'))
    nt = (nt.filter(nt.nodes_in_set.contains(nt.node),

    return nt
Exemple #31
 def from_spark(self, df, key):
     """Wrap a Spark DataFrame as a ``Table`` keyed by `key`.

     Passes the DataFrame's ``_jdf`` handle and `key` to the Java-side
     ``Table.pyFromDF`` and wraps the returned object.
     """
     java_table_cls = Env.hail().table.Table
     java_table = java_table_cls.pyFromDF(df._jdf, key)
     return Table._from_java(java_table)
Exemple #32
 def persist_table(self, t, storage_level):
     """Persist table `t` at `storage_level` and return the result as a ``Table``.

     The table's IR (``t._tir``) is converted with ``self._to_java_ir`` and
     ``pyPersist`` is invoked on the converted object.
     """
     java_ir = self._to_java_ir(t._tir)
     persisted = java_ir.pyPersist(storage_level)
     return Table._from_java(persisted)
Exemple #33
 def from_pandas(self, df, key):
     """Load a pandas DataFrame as a ``Table`` keyed by `key`.

     `df` is first turned into a Spark DataFrame through the SQL context's
     ``createDataFrame`` and then handed to ``Table.from_spark``.
     """
     spark_df = Env.sql_context().createDataFrame(df)
     return Table.from_spark(spark_df, key)
Exemple #34
 def from_spark(self, df, key):
     """Wrap a Spark DataFrame as a ``Table`` keyed by `key`.

     Passes the DataFrame's ``_jdf`` handle and `key` to the Java-side
     ``Table.pyFromDF`` and wraps the returned object.
     """
     java_table_cls = Env.hail().table.Table
     java_table = java_table_cls.pyFromDF(df._jdf, key)
     return Table._from_java(java_table)
Exemple #35
    def fit_alternatives_numpy(self, pa, a=None):
        r"""Fit and test alternative model for each augmented design matrix.

        The resulting table has the following fields:

        .. list-table::
          :header-rows: 1

          * - Field
            - Type
            - Value
          * - `idx`
            - int64
            - Index of augmented design matrix.
          * - `beta`
            - float64
            - :math:`\beta_\star`
          * - `sigma_sq`
            - float64
            - :math:`\sigma^2`
          * - `chi_sq`
            - float64
            - :math:`\chi^2`
          * - `p_value`
            - float64
            - p-value

        Parameters
        ----------
        pa : :class:`numpy.ndarray`
            Projected matrix :math:`P_r A` of alternatives with shape :math:`(r, m)`.
            Each column is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a : :class:`numpy.ndarray`, optional
            Matrix :math:`A` of alternatives with shape :math:`(n, m)`.
            Each column is an augmentation :math:`x_\star` of :math:`X`.
            Required for low-rank inference.

        Returns
        -------
        :class:`.Table`
            Table of results for each augmented design matrix.

        Raises
        ------
        Exception
            If the null model has not been fit yet.
        AssertionError
            If the shape of `pa` (or of `a` in the low-rank case) does not
            agree with the model dimensions.
        """
        self._check_dof(self.f + 1)

        if not self._fitted:
            raise Exception("null model is not fit. Run 'fit' first.")

        n_cols = pa.shape[1]
        assert pa.shape[0] == self.r

        # Low-rank models need the matching column of `a` for every
        # alternative; full-rank models pass None for it. The two cases are
        # mutually exclusive, so the second must not overwrite the first.
        if self.low_rank:
            assert a.shape[0] == self.n and a.shape[1] == n_cols
            data = [(i, ) + self._fit_alternative_numpy(pa[:, i], a[:, i])
                    for i in range(n_cols)]
        else:
            data = [(i, ) + self._fit_alternative_numpy(pa[:, i], None)
                    for i in range(n_cols)]

        # One record per alternative: its column index followed by the
        # per-alternative results, in the column order given below.
        df = pd.DataFrame.from_records(
            data, columns=['idx', 'beta', 'sigma_sq', 'chi_sq', 'p_value'])

        return Table.from_pandas(df, key='idx')
Exemple #36
def pc_relate(call_expr,
              include_self_kinship=False) -> Table:
    r"""Compute relatedness estimates between individuals using a variant of the
    PC-Relate method.

    .. include:: ../_templates/req_diploid_gt.rst

    Estimate kinship, identity-by-descent two, identity-by-descent one, and
    identity-by-descent zero for every pair of samples, using a minimum minor
    allele frequency filter of 0.01 and 10 principal components to control
    for population structure.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10)

    Only compute the kinship statistic. This is more efficient than
    computing all statistics.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10, statistics='kin')

    Compute all statistics, excluding sample-pairs with kinship less
    than 0.1. This is more efficient than producing the full table and
    then filtering using :meth:`.Table.filter`.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10, min_kinship=0.1)

    One can also pass in pre-computed principal component scores.
    To produce the same results as in the previous example:

    >>> _, scores_table, _ = hl.hwe_normalized_pca(dataset.GT,
    ...                                      k=10,
    ...                                      compute_loadings=False)
    >>> rel = hl.pc_relate(dataset.GT,
    ...                    0.01,
    ...                    scores_expr=scores_table[dataset.col_key].scores,
    ...                    min_kinship=0.1)

    The traditional estimator for kinship between a pair of individuals
    :math:`i` and :math:`j`, sharing the set :math:`S_{ij}` of
    single-nucleotide variants, from a population with allele frequencies
    :math:`p_s`, is given by:

    .. math::

      \widehat{\phi_{ij}} \coloneqq
        \sum_{s \in S_{ij}}
          \frac{(g_{is} - 2 p_s) (g_{js} - 2 p_s)}
                {4 \sum_{s \in S_{ij}} p_s (1 - p_s)}

    This estimator is true under the model that the sharing of common
    (relative to the population) alleles is not very informative to
    relatedness (because they're common) and the sharing of rare alleles
    suggests a recent common ancestor from which the allele was inherited by
    descent.

    When multiple ancestry groups are mixed in a sample, this model breaks
    down. Alleles that are rare in all but one ancestry group are treated as
    very informative to relatedness. However, these alleles are simply
    markers of the ancestry group. The PC-Relate method corrects for this
    situation and the related situation of admixed individuals.

    PC-Relate slightly modifies the usual estimator for relatedness:
    occurrences of population allele frequency are replaced with an
    "individual-specific allele frequency". This modification allows the
    method to correctly weight an allele according to an individual's unique
    ancestry profile.

    The "individual-specific allele frequency" at a given genetic locus is
    modeled by PC-Relate as a linear function of a sample's first ``k``
    principal component coordinates. As such, the efficacy of this method
    rests on two assumptions:

     - an individual's first `k` principal component coordinates fully
       describe their allele-frequency-relevant ancestry, and

     - the relationship between ancestry (as described by principal
       component coordinates) and population allele frequency is linear

    The estimators for kinship, and identity-by-descent zero, one, and two
    follow. Let:

     - :math:`S_{ij}` be the set of genetic loci at which both individuals
       :math:`i` and :math:`j` have a defined genotype

     - :math:`g_{is} \in \{0, 1, 2\}` be the number of alternate alleles that
       individual :math:`i` has at genetic locus :math:`s`

     - :math:`\widehat{\mu_{is}} \in [0, 1]` be the individual-specific allele
       frequency for individual :math:`i` at genetic locus :math:`s`

     - :math:`{\widehat{\sigma^2_{is}}} \coloneqq \widehat{\mu_{is}} (1 - \widehat{\mu_{is}})`,
       the binomial variance of :math:`\widehat{\mu_{is}}`

     - :math:`\widehat{\sigma_{is}} \coloneqq \sqrt{\widehat{\sigma^2_{is}}}`,
       the binomial standard deviation of :math:`\widehat{\mu_{is}}`

     - :math:`\text{IBS}^{(0)}_{ij} \coloneqq \sum_{s \in S_{ij}} \mathbb{1}_{|g_{is} - g_{js}| = 2}`,
       the number of genetic loci at which individuals :math:`i` and :math:`j`
       share no alleles

     - :math:`\widehat{f_i} \coloneqq 2 \widehat{\phi_{ii}} - 1`, the inbreeding
       coefficient for individual :math:`i`

     - :math:`g^D_{is}` be a dominance encoding of the genotype matrix, and
       :math:`X_{is}` be a normalized dominance-coded genotype matrix

    .. math::

        g^D_{is} \coloneqq
          \begin{cases}
            \widehat{\mu_{is}}     & g_{is} = 0 \\
            0                        & g_{is} = 1 \\
            1 - \widehat{\mu_{is}} & g_{is} = 2
          \end{cases}

        X_{is} \coloneqq g^D_{is} - \widehat{\sigma^2_{is}} (1 - \widehat{f_i})

    The estimator for kinship is given by:

    .. math::

      \widehat{\phi_{ij}} \coloneqq
        \frac{\sum_{s \in S_{ij}}(g - 2 \mu)_{is} (g - 2 \mu)_{js}}
              {4 * \sum_{s \in S_{ij}}
                            \widehat{\sigma_{is}} \widehat{\sigma_{js}}}

    The estimator for identity-by-descent two is given by:

    .. math::

      \widehat{k^{(2)}_{ij}} \coloneqq
        \frac{\sum_{s \in S_{ij}}X_{is} X_{js}}{\sum_{s \in S_{ij}}
          \widehat{\sigma^2_{is}} \widehat{\sigma^2_{js}}}

    The estimator for identity-by-descent zero is given by:

    .. math::

      \widehat{k^{(0)}_{ij}} \coloneqq
        \begin{cases}
          \frac{\text{IBS}^{(0)}_{ij}}
                {\sum_{s \in S_{ij}}
                       \widehat{\mu_{is}}^2(1 - \widehat{\mu_{js}})^2
                       + (1 - \widehat{\mu_{is}})^2\widehat{\mu_{js}}^2}
            & \widehat{\phi_{ij}} > 2^{-5/2} \\
          1 - 4 \widehat{\phi_{ij}} + k^{(2)}_{ij}
            & \widehat{\phi_{ij}} \le 2^{-5/2}
        \end{cases}

    The estimator for identity-by-descent one is given by:

    .. math::

      \widehat{k^{(1)}_{ij}} \coloneqq
        1 - \widehat{k^{(2)}_{ij}} - \widehat{k^{(0)}_{ij}}

    Note that, even if present, phase information is ignored by this method.

    The PC-Relate method is described in "Model-free Estimation of Recent
    Genetic Relatedness". Conomos MP, Reiner AP, Weir BS, Thornton TA. in
    American Journal of Human Genetics. 2016 Jan 7. The reference
    implementation is available in the `GENESIS Bioconductor package
    <https://bioconductor.org/packages/release/bioc/html/GENESIS.html>`_ .

    :func:`.pc_relate` differs from the reference implementation in a few
    ways:

     - if `k` is supplied, samples scores are computed via PCA on all samples,
       not a specified subset of genetically unrelated samples. The latter
       can be achieved by filtering samples, computing PCA variant loadings,
       and using these loadings to compute and pass in scores for all samples.

     - the estimators do not perform small sample correction

     - the algorithm does not provide an option to use population-wide
       allele frequency estimates

     - the algorithm does not provide an option to not use "overall
       standardization" (see R ``pcrelate`` documentation)

    Under the PC-Relate model, kinship, :math:`\phi_{ij}`, ranges from 0 to
    0.5, and is precisely half of the
    fraction-of-genetic-material-shared. Listed below are the statistics for
    a few pairings:

     - Monozygotic twins share all their genetic material so their kinship
       statistic is 0.5 in expectation.

     - Parent-child and sibling pairs both have kinship 0.25 in expectation
       and are separated by the identity-by-descent-zero, :math:`k^{(0)}_{ij}`,
       statistic which is zero for parent-child pairs and 0.25 for sibling
       pairs.

     - Avuncular pairs and grand-parent/-child pairs both have kinship 0.125
       in expectation and both have identity-by-descent-zero 0.5 in expectation

     - "Third degree relatives" are those pairs sharing
       :math:`2^{-3} = 12.5\%` of their genetic material. The results of
       PC-Relate are often too noisy to reliably distinguish these pairs from
       higher-degree-relative-pairs or unrelated pairs.

    Note that :math:`g_{is}` is the number of alternate alleles. Hence, for
    multi-allelic variants, a value of 2 may indicate two distinct alternative
    alleles rather than a homozygous variant genotype. To enforce the latter,
    either filter or split multi-allelic variants first.

    The resulting table has the first 3, 4, 5, or 6 fields below, depending on
    the `statistics` parameter:

     - `i` (``col_key.dtype``) -- First sample. (key field)
     - `j` (``col_key.dtype``) -- Second sample. (key field)
     - `kin` (:py:data:`.tfloat64`) -- Kinship estimate, :math:`\widehat{\phi_{ij}}`.
     - `ibd2` (:py:data:`.tfloat64`) -- IBD2 estimate, :math:`\widehat{k^{(2)}_{ij}}`.
     - `ibd0` (:py:data:`.tfloat64`) -- IBD0 estimate, :math:`\widehat{k^{(0)}_{ij}}`.
     - `ibd1` (:py:data:`.tfloat64`) -- IBD1 estimate, :math:`\widehat{k^{(1)}_{ij}}`.

    Here ``col_key`` refers to the column key of the source matrix table,
    and ``col_key.dtype`` is a struct containing the column key fields.

    There is one row for each pair of distinct samples (columns), where `i`
    corresponds to the column of smaller column index. In particular, if the
    same column key value exists for :math:`n` columns, then the resulting
    table will have :math:`\binom{n}{2}` rows with both key fields equal to
    that column key value. This may result in unexpected behavior in downstream
    processing.

    call_expr : :class:`.CallExpression`
        Entry-indexed call expression.
    min_individual_maf : :obj:`float`
        The minimum individual-specific minor allele frequency.
        If either individual-specific minor allele frequency for a pair of
        individuals is below this threshold, then the variant will not
        be used to estimate relatedness for the pair.
    k : :obj:`int`, optional
        If set, `k` principal component scores are computed and used.
        Exactly one of `k` and `scores_expr` must be specified.
    scores_expr : :class:`.ArrayNumericExpression`, optional
        Column-indexed expression of principal component scores, with the same
        source as `call_expr`. All array values must have the same positive length,
        corresponding to the number of principal components, and all scores must
        be non-missing. Exactly one of `k` and `scores_expr` must be specified.
    min_kinship : :obj:`float`, optional
        If set, pairs of samples with kinship lower than `min_kinship` are excluded
        from the results.
    statistics : :class:`str`
        Set of statistics to compute.
        If ``'kin'``, only estimate the kinship statistic.
        If ``'kin2'``, estimate the above and IBD2.
        If ``'kin20'``, estimate the above and IBD0.
        If ``'all'``, estimate the above and IBD1.
    block_size : :obj:`int`, optional
        Block size of block matrices used in the algorithm.
        Default given by :meth:`.BlockMatrix.default_block_size`.
    include_self_kinship: :obj:`bool`
        If ``True``, include entries for an individual's estimated kinship with
        themselves. Defaults to ``False``.

        A :class:`.Table` mapping pairs of samples to their pair-wise statistics.
    mt = matrix_table_source('pc_relate/call_expr', call_expr)

    if k and scores_expr is None:
        _, scores, _ = hwe_normalized_pca(call_expr, k, compute_loadings=False)
        scores_expr = scores[mt.col_key].scores
    elif not k and scores_expr is not None:
        analyze('pc_relate/scores_expr', scores_expr, mt._col_indices)
    elif k and scores_expr is not None:
        raise ValueError(
            "pc_relate: exactly one of 'k' and 'scores_expr' must be set, found both"
        raise ValueError(
            "pc_relate: exactly one of 'k' and 'scores_expr' must be set, found neither"

    scores_table = mt.select_cols(__scores=scores_expr)\

    n_missing = scores_table.aggregate(
    if n_missing > 0:
        raise ValueError(
            f'Found {n_missing} columns with missing scores array.')

    mt = mt.select_entries(__gt=call_expr.n_alt_alleles()).unfilter_entries()
    mt = mt.annotate_rows(__mean_gt=agg.mean(mt.__gt))
    mean_imputed_gt = hl.or_else(hl.float64(mt.__gt), mt.__mean_gt)

    if not block_size:
        block_size = BlockMatrix.default_block_size()

    g = BlockMatrix.from_entry_expr(mean_imputed_gt, block_size=block_size)

    pcs = scores_table.collect(_localize=False).map(lambda x: x.__scores)

    ht = Table(
            g._bmir, pcs._ir, {
                'name': 'PCRelate',
                'maf': min_individual_maf,
                'blockSize': block_size,
                'minKinship': min_kinship,
                'statistics': {
                    'kin': 0,
                    'kin2': 1,
                    'kin20': 2,
                    'all': 3

    if statistics == 'kin':
        ht = ht.drop('ibd0', 'ibd1', 'ibd2')
    elif statistics == 'kin2':
        ht = ht.drop('ibd0', 'ibd1')
    elif statistics == 'kin20':
        ht = ht.drop('ibd1')

    if not include_self_kinship:
        ht = ht.filter(ht.i == ht.j, keep=False)

    col_keys = hl.literal(mt.select_cols().key_cols_by().cols().collect(),
    return ht.key_by(i=col_keys[ht.i], j=col_keys[ht.j])
Exemple #37
def nirvana(dataset: Union[MatrixTable, Table], config, block_size=500000, name='nirvana'):
    """Annotate variants using `Nirvana <https://github.com/Illumina/Nirvana>`_.

    .. include:: ../_templates/experimental.rst

    .. include:: ../_templates/req_tvariant.rst

    :func:`.nirvana` runs `Nirvana
    <https://github.com/Illumina/Nirvana>`_ on the current dataset and adds a
    new row field in the location specified by `name`.

    Examples
    --------

    Add Nirvana annotations to the dataset:

    >>> result = hl.nirvana(dataset, "data/nirvana.properties") # doctest: +SKIP

    **Configuration**

    :func:`.nirvana` requires a configuration file. The format is a
    `.properties file <https://en.wikipedia.org/wiki/.properties>`__, where each
    line defines a property as a key-value pair of the form ``key = value``.
    :func:`.nirvana` supports the following properties:

    - **hail.nirvana.dotnet** -- Location of dotnet. Optional, default: dotnet.
    - **hail.nirvana.path** -- Value of the PATH environment variable when
      invoking Nirvana. Optional, by default PATH is not set.
    - **hail.nirvana.location** -- Location of Nirvana.dll. Required.
    - **hail.nirvana.reference** -- Location of reference genome. Required.
    - **hail.nirvana.cache** -- Location of cache. Required.
    - **hail.nirvana.supplementaryAnnotationDirectory** -- Location of
      Supplementary Database. Optional, no supplementary database by default.

    Here is an example ``nirvana.properties`` configuration file:

    .. code-block:: text

        hail.nirvana.location = /path/to/dotnet/netcoreapp2.0/Nirvana.dll
        hail.nirvana.reference = /path/to/nirvana/References/Homo_sapiens.GRCh37.Nirvana.dat
        hail.nirvana.cache = /path/to/nirvana/Cache/GRCh37/Ensembl
        hail.nirvana.supplementaryAnnotationDirectory = /path/to/nirvana/SupplementaryDatabase/GRCh37

    **Annotations**

    A new row field is added in the location specified by `name` with the
    following schema:

    .. code-block:: text

        struct {
            chromosome: str,
            refAllele: str,
            position: int32,
            altAlleles: array<str>,
            cytogeneticBand: str,
            quality: float64,
            filters: array<str>,
            jointSomaticNormalQuality: int32,
            copyNumber: int32,
            strandBias: float64,
            recalibratedQuality: float64,
            variants: array<struct {
                altAllele: str,
                refAllele: str,
                chromosome: str,
                begin: int32,
                end: int32,
                phylopScore: float64,
                isReferenceMinor: bool,
                variantType: str,
                vid: str,
                hgvsg: str,
                isRecomposedVariant: bool,
                isDecomposedVariant: bool,
                regulatoryRegions: array<struct {
                    id: str,
                    type: str,
                    consequence: set<str>
                }>,
                clinvar: array<struct {
                    id: str,
                    reviewStatus: str,
                    isAlleleSpecific: bool,
                    alleleOrigins: array<str>,
                    refAllele: str,
                    altAllele: str,
                    phenotypes: array<str>,
                    medGenIds: array<str>,
                    omimIds: array<str>,
                    orphanetIds: array<str>,
                    significance: str,
                    lastUpdatedDate: str,
                    pubMedIds: array<str>
                }>,
                cosmic: array<struct {
                    id: str,
                    isAlleleSpecific: bool,
                    refAllele: str,
                    altAllele: str,
                    gene: str,
                    sampleCount: int32,
                    studies: array<struct {
                        id: int32,
                        histology: str,
                        primarySite: str
                    }>
                }>,
                dbsnp: struct {
                    ids: array<str>
                },
                globalAllele: struct {
                    globalMinorAllele: str,
                    globalMinorAlleleFrequency: float64
                },
                gnomad: struct {
                    coverage: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    allHc: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    afrHc: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    amrHc: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    easHc: int32,
                    finAf: float64,
                    finAc: int32,
                    finAn: int32,
                    finHc: int32,
                    nfeAf: float64,
                    nfeAc: int32,
                    nfeAn: int32,
                    nfeHc: int32,
                    othAf: float64,
                    othAc: int32,
                    othAn: int32,
                    othHc: int32,
                    asjAf: float64,
                    asjAc: int32,
                    asjAn: int32,
                    asjHc: int32,
                    failedFilter: bool
                },
                gnomadExome: struct {
                    coverage: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    allHc: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    afrHc: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    amrHc: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    easHc: int32,
                    finAf: float64,
                    finAc: int32,
                    finAn: int32,
                    finHc: int32,
                    nfeAf: float64,
                    nfeAc: int32,
                    nfeAn: int32,
                    nfeHc: int32,
                    othAf: float64,
                    othAc: int32,
                    othAn: int32,
                    othHc: int32,
                    asjAf: float64,
                    asjAc: int32,
                    asjAn: int32,
                    asjHc: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32,
                    sasHc: int32,
                    failedFilter: bool
                },
                topmed: struct {
                    failedFilter: bool,
                    allAc: int32,
                    allAn: int32,
                    allAf: float64,
                    allHc: int32
                },
                oneKg: struct {
                    ancestralAllele: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    eurAf: float64,
                    eurAc: int32,
                    eurAn: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32
                },
                mitomap: array<struct {
                    refAllele: str,
                    altAllele: str,
                    diseases: array<str>,
                    hasHomoplasmy: bool,
                    hasHeteroplasmy: bool,
                    status: str,
                    clinicalSignificance: str,
                    scorePercentile: float64,
                    isAlleleSpecific: bool,
                    chromosome: str,
                    begin: int32,
                    end: int32,
                    variantType: str
                }>,
                transcripts: struct {
                    refSeq: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cdnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>,
                    ensembl: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cdnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>
                },
                overlappingGenes: array<str>
            }>,
            genes: array<struct {
                name: str,
                omim: array<struct {
                    mimNumber: int32,
                    hgnc: str,
                    description: str,
                    phenotypes: array<struct {
                        mimNumber: int32,
                        phenotype: str,
                        mapping: str,
                        inheritance: array<str>,
                        comments: str
                    }>
                }>,
                exac: struct {
                    pLi: float64,
                    pRec: float64,
                    pNull: float64
                }
            }>
        }

    Parameters
    ----------
    dataset : :class:`.MatrixTable` or :class:`.Table`
        Dataset.
    config : :obj:`str`
        Path to Nirvana configuration file.
    block_size : :obj:`int`
        Number of rows to process per Nirvana invocation.
    name : :obj:`str`
        Name for resulting row field.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
        Dataset with new row-indexed field `name` containing Nirvana annotations.
    """
    is_matrix_table = isinstance(dataset, MatrixTable)

    # Validate the key with the checker matching the input type, then reduce
    # the input to a table with no selected fields to hand to Nirvana.
    if is_matrix_table:
        require_row_key_variant(dataset, 'nirvana')
        ht = dataset.select_rows().rows()
    else:
        require_table_key_variant(dataset, 'nirvana')
        ht = dataset.select()

    annotations = Table._from_java(Env.hail().methods.Nirvana.apply(ht._jt, config, block_size))

    # Join the `nirvana` field back onto the original dataset by key.
    if is_matrix_table:
        return dataset.annotate_rows(**{name: annotations[dataset.row_key].nirvana})
    return dataset.annotate(**{name: annotations[dataset.key].nirvana})
Exemple #38
 def from_pandas(self, df, key):
     """Load a pandas DataFrame as a ``Table`` keyed by `key`.

     `df` is first turned into a Spark DataFrame through the SQL context's
     ``createDataFrame`` and then handed to ``Table.from_spark``.
     """
     spark_df = Env.sql_context().createDataFrame(df)
     return Table.from_spark(spark_df, key)
Exemple #39
def mendel_errors(dataset, pedigree):
    r"""Find Mendel errors; count per variant, individual and nuclear family.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Find all violations of Mendelian inheritance in each (dad, mom, kid) trio in
    a pedigree and return four tables (all errors, errors by family, errors by
    individual, errors by variant):

    >>> ped = hl.Pedigree.read('data/trios.fam')
    >>> all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(dataset, ped)

    Export all mendel errors to a text file:

    >>> all_errors.export('output/all_mendel_errors.tsv')

    Annotate columns with the number of Mendel errors:

    >>> annotated_samples = dataset.annotate_cols(mendel=per_sample[dataset.s])

    Annotate rows with the number of Mendel errors:

    >>> annotated_variants = dataset.annotate_rows(mendel=per_variant[dataset.locus, dataset.alleles])

    Notes
    -----

    The example above returns four tables, which contain Mendelian violations
    grouped in various ways. These tables are modeled after the `PLINK mendel
    formats <https://www.cog-genomics.org/plink2/formats#mendel>`_, resembling
    the ``.mendel``, ``.fmendel``, ``.imendel``, and ``.lmendel`` formats,
    respectively.

    **First table:** all Mendel errors. This table contains one row per Mendel
    error, keyed by the variant and proband id.

        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - (column key of `dataset`) (:py:data:`.tstr`) -- Proband ID, key field.
        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - `code` (:py:data:`.tint32`) -- Mendel error code, see below.
        - `error` (:py:data:`.tstr`) -- Readable representation of Mendel error.

    **Second table:** errors per nuclear family. This table contains one row
    per nuclear family, keyed by the parents.

        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `pat_id` (:py:data:`.tstr`) -- Paternal ID. (key field)
        - `mat_id` (:py:data:`.tstr`) -- Maternal ID. (key field)
        - `children` (:py:data:`.tint32`) -- Number of children in this nuclear family.
        - `errors` (:py:data:`.tint32`) -- Number of Mendel errors in this nuclear family.
        - `snp_errors` (:py:data:`.tint32`) -- Number of Mendel errors at SNPs in this
          nuclear family.

    **Third table:** errors per individual. This table contains one row per
    individual. Each error is counted toward the proband, father, and mother
    according to the `Implicated` in the table below.

        - (column key of `dataset`) (:py:data:`.tstr`) -- Sample ID (key field).
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual at SNPs.

    **Fourth table:** errors per variant.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - `errors` (:py:data:`.tint32`) -- Number of Mendel errors in this variant.

    This method only considers complete trios (two parents and proband with
    defined sex). The code of each Mendel error is determined by the table
    below, extending the
    `Plink classification <https://www.cog-genomics.org/plink2/basic_stats#mendel>`__.

    In the table, the copy state of a locus with respect to a trio is defined
    as follows, where PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__ (PAR) of X and Y
    defined by the reference genome and the autosome is defined by
    :meth:`.Locus.in_autosome`.

    - Auto -- in autosome or in PAR or female child
    - HemiX -- in non-PAR of X and male child
    - HemiY -- in non-PAR of Y and male child

    `Any` refers to the set \{ HomRef, Het, HomVar, NoCall \} and `~`
    denotes complement in this set.

    +------+---------+---------+--------+------------+---------------+
    | Code | Dad     | Mom     | Kid    | Copy State | Implicated    |
    +======+=========+=========+========+============+===============+
    |    1 | HomVar  | HomVar  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    2 | HomRef  | HomRef  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    3 | HomRef  | ~HomRef | HomVar | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    4 | ~HomRef | HomRef  | HomVar | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    5 | HomRef  | HomRef  | HomVar | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    6 | HomVar  | ~HomVar | HomRef | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    7 | ~HomVar | HomVar  | HomRef | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    8 | HomVar  | HomVar  | HomRef | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    9 | Any     | HomVar  | HomRef | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   10 | Any     | HomRef  | HomVar | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   11 | HomVar  | Any     | HomRef | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   12 | HomRef  | Any     | HomVar | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
    pedigree : :class:`.Pedigree`
        Sample pedigree.

    Returns
    -------
    (:class:`.Table`, :class:`.Table`, :class:`.Table`, :class:`.Table`)
        Four tables as detailed in notes with Mendel error statistics.
    """
    # Validate the biallelic requirement first; the checked dataset returned by
    # the helper is the one used for the computation below.
    dataset = require_biallelic(dataset, 'mendel_errors')

    # The computation is delegated to the JVM backend, which hands back a
    # 4-tuple of tables accessed through `_1()` .. `_4()`, in the order
    # described in the Notes section.
    kts = dataset._jvds.mendelErrors(pedigree._jrep)
    return (
        Table(kts._1()),
        Table(kts._2()),
        Table(kts._3()),
        Table(kts._4()),
    )