Example #1
def super_unify_types(*ts):
    ts = [t for t in ts if t is not None]
    if len(ts) == 0:
        return None
    t0 = ts[0]
    if all(is_numeric(t) for t in ts):
        return unify_types_limited(*ts)
    if any(not isinstance(t, type(t0)) for t in ts):
        return None
    if isinstance(t0, tarray):
        et = super_unify_types(*[t.element_type for t in ts])
        return tarray(et)
    if isinstance(t0, tset):
        et = super_unify_types(*[t.element_type for t in ts])
        return tset(et)
    if isinstance(t0, tdict):
        kt = super_unify_types(*[t.key_type for t in ts])
        vt = super_unify_types(*[t.value_type for t in ts])
        return tdict(kt, vt)
    if isinstance(t0, tstruct):
        keys = [k for t in ts for k in t.fields]
        kvs = {
            k: super_unify_types(*[t.get(k, None) for t in ts])
            for k in keys
        }
        return tstruct(**kvs)
    if all(t0 == t for t in ts):
        return t0

    return None
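
A hedged usage sketch of super_unify_types, with the expected results inferred from the branches above (numeric promotion, container recursion, struct field union) rather than verified against any Hail release; the type constructors tint32, tfloat64, tstr, tarray, and tstruct are assumed to be in scope:

# Inferred behavior, not verified output:
assert super_unify_types(tint32, tfloat64) == tfloat64              # numeric promotion
assert super_unify_types(tarray(tint32), tarray(tfloat64)) == tarray(tfloat64)
# Struct fields are unioned; a field present on only one side unifies with itself.
assert super_unify_types(tstruct(a=tint32), tstruct(b=tstr)) == tstruct(a=tint32, b=tstr)
assert super_unify_types(tint32, tstr) is None                      # incompatible types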
Example #2
    def _bin_op_numeric_unify_types(self, name, other):
        def numeric_proxy(t):
            if t == tbool:
                return tint32
            else:
                return t

        def scalar_type(t):
            if isinstance(t, (tarray, tndarray)):
                return numeric_proxy(t.element_type)
            else:
                return numeric_proxy(t)

        t = unify_types(scalar_type(self.dtype), scalar_type(other.dtype))
        if t is None:
            raise NotImplementedError("'{}' {} '{}'".format(
                self.dtype, name, other.dtype))

        if isinstance(self.dtype, tarray) or isinstance(other.dtype, tarray):
            return tarray(t)
        elif isinstance(self.dtype, tndarray):
            return tndarray(t, self.ndim)
        elif isinstance(other.dtype, tndarray):
            return tndarray(t, other.ndim)

        return t
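
A hedged sketch of what the unification above yields for a few operand dtype combinations, inferred from the code (bools promote to int32 via numeric_proxy, scalar element types are unified, and the array or ndarray shape of either operand is preserved):

# Inferred outcomes, not verified against a Hail release:
#   tarray(tint32)       op  tfloat64   -> tarray(tfloat64)
#   tndarray(tint32, 2)  op  tint64     -> tndarray(tint64, 2)
#   tbool                op  tint32     -> tint32
#   tstr                 op  tint32     -> raises NotImplementedError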
Example #3
def require_table_key_variant(ht, method):
    if (list(ht.key) != ['locus', 'alleles']
            or not isinstance(ht['locus'].dtype, tlocus)
            or not ht['alleles'].dtype == tarray(tstr)):
        raise ValueError("Method '{}' requires key to be two fields 'locus' (type 'locus<any>') and "
                         "'alleles' (type 'array<str>')\n"
                         "  Found:{}".format(method, ''.join(
                             "\n    '{}': {}".format(k, str(ht[k].dtype)) for k in ht.key)))
Example #4
def require_row_key_variant_w_struct_locus(dataset, method):
    if (list(dataset.row_key) != ['locus', 'alleles']
        or not dataset['alleles'].dtype == tarray(tstr)
        or (not isinstance(dataset['locus'].dtype, tlocus)
            and dataset['locus'].dtype != hl.dtype('struct{contig: str, position: int32}'))):
        raise ValueError("Method '{}' requires row key to be two fields 'locus'"
                         " (type 'locus<any>' or 'struct{{contig: str, position: int32}}') and "
                         "'alleles' (type 'array<str>')\n"
                         "  Found:{}".format(method, ''.join(
                             "\n    '{}': {}".format(k, str(dataset[k].dtype)) for k in dataset.row_key)))
Example #5
def array_floating_point_divide(arg_type, ret_type):
    register_function("div", (
        arg_type,
        tarray(arg_type),
    ), tarray(ret_type))
    register_function("div", (tarray(arg_type), arg_type),
                      tarray(ret_type))
    register_function("div", (tarray(arg_type), tarray(arg_type)),
                      tarray(ret_type))
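
The three registrations above cover the scalar/array, array/scalar, and array/array operand shapes. A hedged sketch of how they would resolve, assuming the function was invoked with (tint32, tfloat64):

#   scalar / array:  ("div", (tint32, tarray(tint32)))          -> tarray(tfloat64)
#   array / scalar:  ("div", (tarray(tint32), tint32))          -> tarray(tfloat64)
#   array / array:   ("div", (tarray(tint32), tarray(tint32)))  -> tarray(tfloat64)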
Example #6
def unify_types(*ts):
    limited_unify = unify_types_limited(*ts)
    if limited_unify is not None:
        return limited_unify
    elif all(isinstance(t, tarray) for t in ts):
        et = unify_types_limited(*(t.element_type for t in ts))
        if et is not None:
            return tarray(et)
        else:
            return None
    else:
        return None
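
A hedged contrast between unify_types and unify_types_limited, inferred from the fallback above: the full version additionally unifies arrays whose element types unify, and nothing else.

#   unify_types(tint32, tfloat32)                  -> tfloat32 (limited path succeeds)
#   unify_types(tarray(tint32), tarray(tfloat64))  -> tarray(tfloat64) (array fallback)
#   unify_types(tarray(tint32), tint32)            -> None (mixed container and scalar)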
Example #7
def _require_row_variant_w_struct_locus(mt: MatrixTable) -> NoReturn:
    """
    Similar to hail.methods.misc.require_row_key_variant_w_struct_locus, but not necessarily as keys
    """
    assert check_argument_types()

    if (not {'locus', 'alleles'}.issubset(set(mt.rows().row))
            or not mt['alleles'].dtype == tarray(tstr)
            or (not isinstance(mt['locus'].dtype, tlocus)
                and mt['locus'].dtype != hl.dtype('struct{contig: str, position: int32}'))):
        raise ValueError("'hail.from_matrix_table' requires row to contain two fields 'locus'"
                         " (type 'locus<any>' or 'struct{{contig: str, position: int32}}') and "
                         "'alleles' (type 'array<str>')")
Example #8
def require_row_key_variant(dataset, method):
    if isinstance(dataset, Table):
        key = dataset.key
    else:
        assert isinstance(dataset, MatrixTable)
        key = dataset.row_key
    if (list(key) != ['locus', 'alleles']
            or not isinstance(dataset['locus'].dtype, tlocus)
            or not dataset['alleles'].dtype == tarray(tstr)):
        raise ValueError("Method '{}' requires row key to be two fields 'locus' (type 'locus<any>') and "
                         "'alleles' (type 'array<str>')\n"
                         "  Found:{}".format(method, ''.join(
                             "\n    '{}': {}".format(k, str(dataset[k].dtype)) for k in key)))
Example #9
    def _bin_op_numeric(self, name, other, ret_type_f=None):
        other = to_expr(other)
        unified_type = self._bin_op_numeric_unify_types(name, other)
        me = self._promote_numeric(unified_type)
        other = other._promote_numeric(unified_type)
        if ret_type_f:
            if isinstance(unified_type, tarray):
                ret_type = tarray(ret_type_f(unified_type.element_type))
            elif isinstance(unified_type, tndarray):
                ret_type = tndarray(ret_type_f(unified_type.element_type),
                                    unified_type.ndim)
            else:
                ret_type = ret_type_f(unified_type)
        else:
            ret_type = unified_type
        return me._bin_op(name, other, ret_type)
Example #10
def haplotype_freq_em(gt_counts) -> ArrayExpression:
    """
    Computes estimated haplotype counts based on genotypes for a pair of bi-allelic variants.
    Implements the Excoffier & Slatkin EM (Exccoffier & Slatkin, Mol. Biol. Evol. 1995)

    The unphased input genotype counts for the variant pairs has to be provided in the following order:
    [AABB, AABb, AAbb, AaBB, AaBb, Aabb, aaBB, aaBb, aabb]

    The estimated haplotype counts are returned in an array in the following order:
    [AB, Ab, aB, ab]

    Where _A_ and _a_ are the reference and non-reference alleles for the first variant, resp.
    And _B_ and _b_ are the reference and non-reference alleles for the second variant, resp.

    Parameters
    ----------
    gt_counts : :class:`.ArrayExpression`

    Returns
    -------
    :class:`.ArrayExpression`
    """
    return _func("haplotype_freq_em", tarray(tfloat64), gt_counts)
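
A hedged usage sketch, assuming the function accepts an array<int32> of the nine genotype counts in the order documented above. With no double heterozygotes (an AaBb count of zero) the four haplotype counts are determined exactly, which makes this input a convenient sanity check:

counts = hl.literal([10, 0, 0, 0, 0, 0, 0, 0, 10], dtype='array<int32>')
est = haplotype_freq_em(counts)   # ArrayExpression of four float64 counts: [AB, Ab, aB, ab]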
Example #11
def pc_relate(call_expr,
              min_individual_maf,
              *,
              k=None,
              scores_expr=None,
              min_kinship=None,
              statistics="all",
              block_size=None,
              include_self_kinship=False) -> Table:
    r"""Compute relatedness estimates between individuals using a variant of the
    PC-Relate method.

    .. include:: ../_templates/req_diploid_gt.rst

    Examples
    --------
    Estimate kinship, identity-by-descent two, identity-by-descent one, and
    identity-by-descent zero for every pair of samples, using a minimum minor
    allele frequency filter of 0.01 and 10 principal components to control
    for population structure.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10)

    Only compute the kinship statistic. This is more efficient than
    computing all statistics.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10, statistics='kin')

    Compute all statistics, excluding sample-pairs with kinship less
    than 0.1. This is more efficient than producing the full table and
    then filtering using :meth:`.Table.filter`.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10, min_kinship=0.1)

    One can also pass in pre-computed principal component scores.
    To produce the same results as in the previous example:

    >>> _, scores_table, _ = hl.hwe_normalized_pca(dataset.GT,
    ...                                      k=10,
    ...                                      compute_loadings=False)
    >>> rel = hl.pc_relate(dataset.GT,
    ...                    0.01,
    ...                    scores_expr=scores_table[dataset.col_key].scores,
    ...                    min_kinship=0.1)

    Notes
    -----
    The traditional estimator for kinship between a pair of individuals
    :math:`i` and :math:`j`, sharing the set :math:`S_{ij}` of
    single-nucleotide variants, from a population with allele frequencies
    :math:`p_s`, is given by:

    .. math::

      \widehat{\phi_{ij}} \coloneqq
        \frac{\sum_{s \in S_{ij}} (g_{is} - 2 p_s) (g_{js} - 2 p_s)}
             {4 \sum_{s \in S_{ij}} p_s (1 - p_s)}

    This estimator is accurate under the model that sharing of common
    (relative to the population) alleles is not very informative about
    relatedness (because they are common), while sharing of rare alleles
    suggests a recent common ancestor from whom the allele was inherited by
    descent.

    When multiple ancestry groups are mixed in a sample, this model breaks
    down. Alleles that are rare in all but one ancestry group are treated as
    very informative to relatedness. However, these alleles are simply
    markers of the ancestry group. The PC-Relate method corrects for this
    situation and the related situation of admixed individuals.

    PC-Relate slightly modifies the usual estimator for relatedness:
    occurrences of population allele frequency are replaced with an
    "individual-specific allele frequency". This modification allows the
    method to correctly weight an allele according to an individual's unique
    ancestry profile.

    The "individual-specific allele frequency" at a given genetic locus is
    modeled by PC-Relate as a linear function of a sample's first ``k``
    principal component coordinates. As such, the efficacy of this method
    rests on two assumptions:

     - an individual's first `k` principal component coordinates fully
       describe their allele-frequency-relevant ancestry, and

     - the relationship between ancestry (as described by principal
       component coordinates) and population allele frequency is linear

    The estimators for kinship, and identity-by-descent zero, one, and two
    follow. Let:

     - :math:`S_{ij}` be the set of genetic loci at which both individuals
       :math:`i` and :math:`j` have a defined genotype

     - :math:`g_{is} \in \{0, 1, 2\}` be the number of alternate alleles that
       individual :math:`i` has at genetic locus :math:`s`

     - :math:`\widehat{\mu_{is}} \in [0, 1]` be the individual-specific allele
       frequency for individual :math:`i` at genetic locus :math:`s`

     - :math:`{\widehat{\sigma^2_{is}}} \coloneqq \widehat{\mu_{is}} (1 - \widehat{\mu_{is}})`,
       the binomial variance of :math:`\widehat{\mu_{is}}`

     - :math:`\widehat{\sigma_{is}} \coloneqq \sqrt{\widehat{\sigma^2_{is}}}`,
       the binomial standard deviation of :math:`\widehat{\mu_{is}}`

     - :math:`\text{IBS}^{(0)}_{ij} \coloneqq \sum_{s \in S_{ij}} \mathbb{1}_{\lvert g_{is} - g_{js} \rvert = 2}`,
       the number of genetic loci at which individuals :math:`i` and :math:`j`
       share no alleles

     - :math:`\widehat{f_i} \coloneqq 2 \widehat{\phi_{ii}} - 1`, the inbreeding
       coefficient for individual :math:`i`

     - :math:`g^D_{is}` be a dominance encoding of the genotype matrix, and
       :math:`X_{is}` be a normalized dominance-coded genotype matrix

    .. math::

        g^D_{is} \coloneqq
          \begin{cases}
            \widehat{\mu_{is}}     & g_{is} = 0 \\
            0                        & g_{is} = 1 \\
            1 - \widehat{\mu_{is}} & g_{is} = 2
          \end{cases}

        \qquad
        X_{is} \coloneqq g^D_{is} - \widehat{\sigma^2_{is}} (1 - \widehat{f_i})

    The estimator for kinship is given by:

    .. math::

      \widehat{\phi_{ij}} \coloneqq
        \frac{\sum_{s \in S_{ij}} (g - 2 \mu)_{is} (g - 2 \mu)_{js}}
             {4 \sum_{s \in S_{ij}} \widehat{\sigma_{is}} \widehat{\sigma_{js}}}

    The estimator for identity-by-descent two is given by:

    .. math::

      \widehat{k^{(2)}_{ij}} \coloneqq
        \frac{\sum_{s \in S_{ij}}X_{is} X_{js}}{\sum_{s \in S_{ij}}
          \widehat{\sigma^2_{is}} \widehat{\sigma^2_{js}}}

    The estimator for identity-by-descent zero is given by:

    .. math::

      \widehat{k^{(0)}_{ij}} \coloneqq
        \begin{cases}
          \frac{\text{IBS}^{(0)}_{ij}}
                {\sum_{s \in S_{ij}}
                       \widehat{\mu_{is}}^2(1 - \widehat{\mu_{js}})^2
                       + (1 - \widehat{\mu_{is}})^2\widehat{\mu_{js}}^2}
            & \widehat{\phi_{ij}} > 2^{-5/2} \\
          1 - 4 \widehat{\phi_{ij}} + \widehat{k^{(2)}_{ij}}
            & \widehat{\phi_{ij}} \le 2^{-5/2}
        \end{cases}

    The estimator for identity-by-descent one is given by:

    .. math::

      \widehat{k^{(1)}_{ij}} \coloneqq
        1 - \widehat{k^{(2)}_{ij}} - \widehat{k^{(0)}_{ij}}

    Note that, even if present, phase information is ignored by this method.

    The PC-Relate method is described in "Model-free Estimation of Recent
    Genetic Relatedness". Conomos MP, Reiner AP, Weir BS, Thornton TA. in
    American Journal of Human Genetics. 2016 Jan 7. The reference
    implementation is available in the `GENESIS Bioconductor package
    <https://bioconductor.org/packages/release/bioc/html/GENESIS.html>`_ .

    :func:`.pc_relate` differs from the reference implementation in a few
    ways:

     - if `k` is supplied, sample scores are computed via PCA on all samples,
       not a specified subset of genetically unrelated samples. The latter
       can be achieved by filtering samples, computing PCA variant loadings,
       and using these loadings to compute and pass in scores for all samples.

     - the estimators do not perform small sample correction

     - the algorithm does not provide an option to use population-wide
       allele frequency estimates

     - the algorithm does not provide an option to not use "overall
       standardization" (see R ``pcrelate`` documentation)

    Under the PC-Relate model, kinship, :math:`\phi_{ij}`, ranges from 0 to
    0.5, and is precisely half of the
    fraction-of-genetic-material-shared. Listed below are the statistics for
    a few pairings:

     - Monozygotic twins share all their genetic material, so their kinship
       statistic is 0.5 in expectation.

     - Parent-child and sibling pairs both have kinship 0.25 in expectation
       and are separated by the identity-by-descent-zero statistic,
       :math:`k^{(0)}_{ij}`, which is zero for parent-child pairs and 0.25
       for sibling pairs.

     - Avuncular pairs and grandparent/grandchild pairs both have kinship
       0.125 in expectation and identity-by-descent-zero 0.5 in expectation.

     - "Third degree relatives" are those pairs sharing
       :math:`2^{-3} = 12.5\%` of their genetic material; the results of
       PC-Relate are often too noisy to reliably distinguish these pairs from
       higher-degree-relative pairs or unrelated pairs.

    Note that :math:`g_{is}` is the number of alternate alleles. Hence, for
    multi-allelic variants, a value of 2 may indicate two distinct alternative
    alleles rather than a homozygous variant genotype. To enforce the latter,
    either filter or split multi-allelic variants first.

    The resulting table has the first 3, 4, 5, or 6 fields below, depending on
    the `statistics` parameter:

     - `i` (``col_key.dtype``) -- First sample. (key field)
     - `j` (``col_key.dtype``) -- Second sample. (key field)
     - `kin` (:py:data:`.tfloat64`) -- Kinship estimate, :math:`\widehat{\phi_{ij}}`.
     - `ibd2` (:py:data:`.tfloat64`) -- IBD2 estimate, :math:`\widehat{k^{(2)}_{ij}}`.
     - `ibd0` (:py:data:`.tfloat64`) -- IBD0 estimate, :math:`\widehat{k^{(0)}_{ij}}`.
     - `ibd1` (:py:data:`.tfloat64`) -- IBD1 estimate, :math:`\widehat{k^{(1)}_{ij}}`.

    Here ``col_key`` refers to the column key of the source matrix table,
    and ``col_key.dtype`` is a struct containing the column key fields.

    There is one row for each pair of distinct samples (columns), where `i`
    corresponds to the column of smaller column index. In particular, if the
    same column key value exists for :math:`n` columns, then the resulting
    table will have :math:`\binom{n}{2}` rows with both key fields equal to
    that column key value. This may result in unexpected behavior in downstream
    processing.

    Parameters
    ----------
    call_expr : :class:`.CallExpression`
        Entry-indexed call expression.
    min_individual_maf : :obj:`float`
        The minimum individual-specific minor allele frequency.
        If either individual-specific minor allele frequency for a pair of
        individuals is below this threshold, then the variant will not
        be used to estimate relatedness for the pair.
    k : :obj:`int`, optional
        If set, `k` principal component scores are computed and used.
        Exactly one of `k` and `scores_expr` must be specified.
    scores_expr : :class:`.ArrayNumericExpression`, optional
        Column-indexed expression of principal component scores, with the same
        source as `call_expr`. All array values must have the same positive length,
        corresponding to the number of principal components, and all scores must
        be non-missing. Exactly one of `k` and `scores_expr` must be specified.
    min_kinship : :obj:`float`, optional
        If set, pairs of samples with kinship lower than `min_kinship` are excluded
        from the results.
    statistics : :class:`str`
        Set of statistics to compute.
        If ``'kin'``, only estimate the kinship statistic.
        If ``'kin2'``, estimate the above and IBD2.
        If ``'kin20'``, estimate the above and IBD0.
        If ``'all'``, estimate the above and IBD1.
    block_size : :obj:`int`, optional
        Block size of block matrices used in the algorithm.
        Default given by :meth:`.BlockMatrix.default_block_size`.
    include_self_kinship : :obj:`bool`
        If ``True``, include entries for an individual's estimated kinship with
        themselves. Defaults to ``False``.

    Returns
    -------
    :class:`.Table`
        A :class:`.Table` mapping pairs of samples to their pair-wise statistics.
    """
    mt = matrix_table_source('pc_relate/call_expr', call_expr)

    if k and scores_expr is None:
        _, scores, _ = hwe_normalized_pca(call_expr, k, compute_loadings=False)
        scores_expr = scores[mt.col_key].scores
    elif not k and scores_expr is not None:
        analyze('pc_relate/scores_expr', scores_expr, mt._col_indices)
    elif k and scores_expr is not None:
        raise ValueError(
            "pc_relate: exactly one of 'k' and 'scores_expr' must be set, found both"
        )
    else:
        raise ValueError(
            "pc_relate: exactly one of 'k' and 'scores_expr' must be set, found neither"
        )

    scores_table = mt.select_cols(__scores=scores_expr)\
        .key_cols_by().select_cols('__scores').cols()

    n_missing = scores_table.aggregate(
        agg.count_where(hl.is_missing(scores_table.__scores)))
    if n_missing > 0:
        raise ValueError(
            f'Found {n_missing} columns with missing scores array.')

    mt = mt.select_entries(__gt=call_expr.n_alt_alleles()).unfilter_entries()
    mt = mt.annotate_rows(__mean_gt=agg.mean(mt.__gt))
    mean_imputed_gt = hl.or_else(hl.float64(mt.__gt), mt.__mean_gt)

    if not block_size:
        block_size = BlockMatrix.default_block_size()

    g = BlockMatrix.from_entry_expr(mean_imputed_gt, block_size=block_size)

    pcs = scores_table.collect(_localize=False).map(lambda x: x.__scores)

    ht = Table(
        ir.BlockMatrixToTableApply(
            g._bmir, pcs._ir, {
                'name': 'PCRelate',
                'maf': min_individual_maf,
                'blockSize': block_size,
                'minKinship': min_kinship,
                'statistics': {
                    'kin': 0,
                    'kin2': 1,
                    'kin20': 2,
                    'all': 3
                }[statistics]
            }))

    if statistics == 'kin':
        ht = ht.drop('ibd0', 'ibd1', 'ibd2')
    elif statistics == 'kin2':
        ht = ht.drop('ibd0', 'ibd1')
    elif statistics == 'kin20':
        ht = ht.drop('ibd1')

    if not include_self_kinship:
        ht = ht.filter(ht.i == ht.j, keep=False)

    col_keys = hl.literal(mt.select_cols().key_cols_by().cols().collect(),
                          dtype=tarray(mt.col_key.dtype))
    return ht.key_by(i=col_keys[ht.i], j=col_keys[ht.j])
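
A hedged summary of how the `statistics` parameter interacts with the drop logic at the end of the function: the IR computes all four estimates and the unrequested fields are dropped afterwards.

#   statistics='kin'    -> fields i, j, kin
#   statistics='kin2'   -> fields i, j, kin, ibd2
#   statistics='kin20'  -> fields i, j, kin, ibd2, ibd0
#   statistics='all'    -> fields i, j, kin, ibd2, ibd0, ibd1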
Example #12
def _to_expr(e, dtype):
    if e is None:
        return None
    elif isinstance(e, Expression):
        if e.dtype != dtype:
            assert is_numeric(dtype), 'expected {}, got {}'.format(
                dtype, e.dtype)
            if dtype == tfloat64:
                return hl.float64(e)
            elif dtype == tfloat32:
                return hl.float32(e)
            elif dtype == tint64:
                return hl.int64(e)
            else:
                assert dtype == tint32
                return hl.int32(e)
        return e
    elif not is_compound(dtype):
        # these are not container types and cannot contain expressions if we got here
        return e
    elif isinstance(dtype, tstruct):
        new_fields = []
        found_expr = False
        for f, t in dtype.items():
            value = _to_expr(e[f], t)
            found_expr = found_expr or isinstance(value, Expression)
            new_fields.append(value)

        if not found_expr:
            return e
        else:
            exprs = [
                new_fields[i] if isinstance(new_fields[i], Expression) else
                hl.literal(new_fields[i], dtype[i])
                for i in range(len(new_fields))
            ]
            fields = {name: expr for name, expr in zip(dtype.keys(), exprs)}
            from .typed_expressions import StructExpression
            return StructExpression._from_fields(fields)

    elif isinstance(dtype, tarray):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [
                element if isinstance(element, Expression) else hl.literal(
                    element, dtype.element_type) for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeArray([expr._ir for expr in exprs], None)
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tset):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [
                element if isinstance(element, Expression) else hl.literal(
                    element, dtype.element_type) for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.ToSet(
                ir.ToStream(ir.MakeArray([expr._ir for expr in exprs], None)))
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, ttuple):
        elements = []
        found_expr = False
        assert len(e) == len(dtype.types)
        for i in range(len(e)):
            value = _to_expr(e[i], dtype.types[i])
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            exprs = [
                elements[i] if isinstance(elements[i], Expression) else
                hl.literal(elements[i], dtype.types[i])
                for i in range(len(elements))
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeTuple([expr._ir for expr in exprs])
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tdict):
        keys = []
        values = []
        found_expr = False
        for k, v in e.items():
            k_ = _to_expr(k, dtype.key_type)
            v_ = _to_expr(v, dtype.value_type)
            found_expr = found_expr or isinstance(k_, Expression)
            found_expr = found_expr or isinstance(v_, Expression)
            keys.append(k_)
            values.append(v_)
        if not found_expr:
            return e
        else:
            assert len(keys) > 0
            # Use `to_expr` to convert the keys and values to expressions
            # separately; a common usage mode is statically-known keys with
            # Expression values.
            key_array = to_expr(keys, tarray(dtype.key_type))
            value_array = to_expr(values, tarray(dtype.value_type))
            return hl.dict(hl.zip(key_array, value_array))
    elif isinstance(dtype, hl.tndarray):
        return hl.nd.array(e)
    else:
        raise NotImplementedError(dtype)
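
A hedged sketch of the two paths through _to_expr, inferred from the dict branch above: a value containing no Expression is returned unchanged, while one containing an Expression is lifted into a single Expression of the requested dtype.

plain = _to_expr({'a': 1}, hl.tdict(hl.tstr, hl.tint32))
# plain is the original dict: no Expression was found inside it.
mixed = _to_expr({'a': hl.int32(1)}, hl.tdict(hl.tstr, hl.tint32))
# mixed is a DictExpression built via hl.dict(hl.zip(keys, values)).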
Example #13
def _impute_type(x, partial_type):
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    def refine(t, refined):
        if t is None:
            return refined
        if not isinstance(t, type(refined)):
            raise ExpressionException(
                "Incompatible partial_type, {}, for value {}".format(
                    partial_type, x))
        return t

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        return tbool
    elif isinstance(x, int):
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".
                format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct) or (isinstance(x, dict)
                                   and isinstance(partial_type, tstruct)):
        partial_type = refine(partial_type, hl.tstruct())
        t = tstruct(**{k: _impute_type(x[k], partial_type.get(k)) for k in x})
        return t
    elif isinstance(x, tuple):
        partial_type = refine(partial_type, hl.ttuple())
        return ttuple(*[
            _impute_type(
                element,
                partial_type[index] if index < len(partial_type) else None)
            for index, element in enumerate(x)
        ])
    elif isinstance(x, list):
        partial_type = refine(partial_type, hl.tarray(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)

    elif is_setlike(x):
        partial_type = refine(partial_type, hl.tset(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if not unified_type:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)

    elif isinstance(x, Mapping):
        user_partial_type = partial_type
        partial_type = refine(partial_type, hl.tdict(None, None))
        if len(x) == 0:
            return partial_type
        kts = {
            _impute_type(element, partial_type.key_type)
            for element in x.keys()
        }
        vts = {
            _impute_type(element, partial_type.value_type)
            for element in x.values()
        }
        unified_key_type = super_unify_types(*kts)
        unified_value_type = super_unify_types(*vts)
        if not unified_key_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys {} of types {} ".format(
                    list(x.keys()), list(kts)))
        if not unified_value_type:
            if unified_key_type == hl.tstr and user_partial_type is None:
                return tstruct(**{k: _impute_type(x[k], None) for k in x})

            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None or pd.isna(x):
        return partial_type
    elif isinstance(
            x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(
                type(x), x))
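
A hedged sketch of how partial_type steers _impute_type, with results inferred from the branches above rather than verified against a Hail release:

#   _impute_type([], hl.tarray(hl.tint32))    -> tarray(tint32)   (empty list keeps the partial type)
#   _impute_type([1, 2.0], hl.tarray(None))   -> tarray(tfloat64) (elements unified)
#   _impute_type({'a': 1, 'b': 'x'}, None)    -> tstruct(a=tint32, b=tstr)
#       (a str-keyed dict with heterogeneous values falls back to a struct
#        when no partial type was supplied)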
Example #14
def impute_type(x):
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        return tbool
    elif isinstance(x, int):
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".
                format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct):
        return tstruct(**{k: impute_type(x[k]) for k in x})
    elif isinstance(x, tuple):
        return ttuple(*(impute_type(element) for element in x))
    elif isinstance(x, list):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty list. Use 'hl.empty_array' to create an empty array."
            )
        ts = {impute_type(element) for element in x}
        unified_type = unify_types_limited(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)
    elif isinstance(x, set):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty set. Use 'hl.empty_set' to create an empty set."
            )
        ts = {impute_type(element) for element in x}
        unified_type = unify_types_limited(*ts)
        if not unified_type:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)
    elif isinstance(x, Mapping):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty dict. Use 'hl.empty_dict' to create an empty dict."
            )
        kts = {impute_type(element) for element in x.keys()}
        vts = {impute_type(element) for element in x.values()}
        unified_key_type = unify_types_limited(*kts)
        unified_value_type = unify_types_limited(*vts)
        if not unified_key_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys of types {} ".format(list(kts)))
        if not unified_value_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None:
        raise ExpressionException("Hail cannot impute the type of 'None'")
    elif isinstance(
            x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(
                type(x), x))
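
A hedged usage sketch of impute_type, with results inferred from the code above; unlike _impute_type, there is no partial_type escape hatch, so empty containers are rejected outright:

#   impute_type(True)        -> tbool
#   impute_type([1, 2.0])    -> tarray(tfloat64)    (numeric unification)
#   impute_type([1, 'x'])    -> ExpressionException (heterogeneous list)
#   impute_type([])          -> ExpressionException (use hl.empty_array instead)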