Code Example #1
File: aggregators.py Project: ahiduchick/hail
    def _group_by(self, group, agg_expr):
        if group._aggregations:
            raise ExpressionException(
                "'group_by' does not support an already-aggregated expression as the argument to 'group'"
            )

        if isinstance(agg_expr._ir, ApplyScanOp):
            if not self._as_scan:
                raise TypeError(
                    "'agg.group_by' requires a non-scan aggregation expression (agg.*) as the argument to 'agg_expr'"
                )
        elif isinstance(agg_expr._ir, ApplyAggOp):
            if self._as_scan:
                raise TypeError(
                    "'scan.group_by' requires a scan aggregation expression (scan.*) as the argument to 'agg_expr'"
                )
        else:
            raise TypeError(
                "'group_by' requires an aggregation expression as the argument to 'agg_expr'"
            )

        ir = agg_expr._ir
        agg_sig = ir.agg_sig
        a = ir.a

        new_agg_sig = AggSignature(f'Keyed({agg_sig.op})',
                                   agg_sig.ctor_arg_types,
                                   agg_sig.initop_arg_types,
                                   [group.dtype] + agg_sig.seqop_arg_types)

        def rewrite_a(ir):
            if isinstance(ir, SeqOp):
                return SeqOp(ir.i, [group._ir] + ir.args, new_agg_sig)
            else:
                return ir.map_ir(rewrite_a)

        new_a = rewrite_a(a)

        if isinstance(agg_expr._ir, ApplyAggOp):
            ir = ApplyAggOp(new_a, ir.constructor_args, ir.init_op_args,
                            new_agg_sig)
        else:
            assert isinstance(agg_expr._ir, ApplyScanOp)
            ir = ApplyScanOp(new_a, ir.constructor_args, ir.init_op_args,
                             new_agg_sig)

        return construct_expr(ir, hl.tdict(group.dtype, agg_expr.dtype),
                              agg_expr._indices, agg_expr._aggregations)
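
(Not from the source file.) A minimal usage sketch of the aggregator this method implements: `hl.agg.group_by` returns an expression of type `tdict(group.dtype, agg_expr.dtype)`, so aggregating it yields an ordinary Python dict.

import hail as hl

ht = hl.utils.range_table(10)
ht = ht.annotate(parity=hl.if_else(ht.idx % 2 == 0, 'even', 'odd'))
# The result type is dict<str, int64>, matching tdict(group.dtype, agg_expr.dtype).
result = ht.aggregate(hl.agg.group_by(ht.parity, hl.agg.count()))
# result == {'even': 5, 'odd': 5}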
Code Example #2
File: test_ir.py Project: danking/hail
 def values(self):
     values = [
         (hl.tbool, True),
         (hl.tint32, 0),
         (hl.tint64, 0),
         (hl.tfloat32, 0.5),
         (hl.tfloat64, 0.5),
         (hl.tstr, "foo"),
         (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
         (hl.tarray(hl.tint32), [0, 1, 4]),
         (hl.tset(hl.tint32), {0, 1, 4}),
         (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
         (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
         (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
         (hl.tcall, hl.Call([0, 1]))
     ]
     return values
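
A hedged sketch of one plausible way these (type, value) pairs are exercised in the surrounding tests, assuming an initialized Hail context:

for t, v in self.values():
    expr = hl.literal(v, dtype=t)   # build a literal of the declared type
    assert expr.dtype == t
    assert hl.eval(expr) == v       # round-trip through the expression compiler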
Code Example #3
File: helpers.py Project: jigold/hail
def create_all_values():
    return hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(
            hl.locus('1', 999),
            hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )
Code Example #4
File: test_file_formats.py Project: bcajes/hail
def create_all_values_datasets():
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(
            hl.locus('1', 999),
            hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )

    def prefix(s, p):
        return hl.struct(**{p + k: s[k] for k in s})

    all_values_table = (hl.utils.range_table(5, n_partitions=3)
                        .annotate_globals(**prefix(all_values, 'global_'))
                        .annotate(**all_values)
                        .cache())

    all_values_matrix_table = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
                               .annotate_globals(**prefix(all_values, 'global_'))
                               .annotate_rows(**prefix(all_values, 'row_'))
                               .annotate_cols(**prefix(all_values, 'col_'))
                               .annotate_entries(**prefix(all_values, 'entry_'))
                               .cache())

    return all_values_table, all_values_matrix_table
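
A quick smoke test (hypothetical, assuming an initialized Hail context) confirming that the `d` field carries the expected `tdict` type in both datasets:

ht, mt = create_all_values_datasets()
assert ht.d.dtype == hl.tdict(hl.tarray(hl.tstr), hl.tfloat64)
assert mt.row_d.dtype == hl.tdict(hl.tarray(hl.tstr), hl.tfloat64)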
Code Example #5
File: type_parsing.py Project: zscu/hail
 def visit_dict(self, node, visited_children):
     # Unpack the parse-tree children of `dict<K, V>`; only the key type and
     # value type are kept, the literal tokens are discarded.
     tdict, _, lbracket, kt, comma, vt, rbracket = visited_children
     return hl.tdict(kt, vt)
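
For context (not part of the source file): Hail exposes this grammar through `hl.dtype`, which parses a type string into the corresponding type object.

import hail as hl

assert hl.dtype('dict<str, int32>') == hl.tdict(hl.tstr, hl.tint32)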
Code Example #6
File: base_expression.py Project: chrisvittal/hail
def _impute_type(x, partial_type):
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    def refine(t, refined):
        if t is None:
            return refined
        if not isinstance(t, type(refined)):
            raise ExpressionException(
                "Incompatible partial_type, {}, for value {}".format(
                    partial_type, x))
        return t

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        return tbool
    elif isinstance(x, int):
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".
                format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct) or (isinstance(x, dict)
                                   and isinstance(partial_type, tstruct)):
        partial_type = refine(partial_type, hl.tstruct())
        t = tstruct(**{k: _impute_type(x[k], partial_type.get(k)) for k in x})
        return t
    elif isinstance(x, tuple):
        partial_type = refine(partial_type, hl.ttuple())
        return ttuple(*[
            _impute_type(
                element,
                partial_type[index] if index < len(partial_type) else None)
            for index, element in enumerate(x)
        ])
    elif isinstance(x, list):
        partial_type = refine(partial_type, hl.tarray(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)

    elif is_setlike(x):
        partial_type = refine(partial_type, hl.tset(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if not unified_type:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)

    elif isinstance(x, Mapping):
        user_partial_type = partial_type
        partial_type = refine(partial_type, hl.tdict(None, None))
        if len(x) == 0:
            return partial_type
        kts = {
            _impute_type(element, partial_type.key_type)
            for element in x.keys()
        }
        vts = {
            _impute_type(element, partial_type.value_type)
            for element in x.values()
        }
        unified_key_type = super_unify_types(*kts)
        unified_value_type = super_unify_types(*vts)
        if not unified_key_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys {} of types {} ".format(
                    list(x.keys()), list(kts)))
        if not unified_value_type:
            if unified_key_type == hl.tstr and user_partial_type is None:
                return tstruct(**{k: _impute_type(x[k], None) for k in x})

            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None or pd.isna(x):
        return partial_type
    elif isinstance(
            x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either"
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(
                type(x), x))
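
A short illustration of the dict branches above (hedged; behavior as implied by the code, assuming an initialized Hail context):

import hail as hl

# Homogeneous keys and values impute to a tdict:
assert hl.literal({'a': 1, 'b': 2}).dtype == hl.tdict(hl.tstr, hl.tint32)
# str-keyed dicts with heterogeneous values fall back to tstruct:
assert hl.literal({'a': 1, 'b': 'x'}).dtype == hl.tstruct(a=hl.tint32, b=hl.tstr)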
Code Example #7
import logging
from collections import defaultdict
from typing import Dict, List

import hail as hl

# Module-level logger; the original file's logger setup is assumed.
logger = logging.getLogger(__name__)


def compute_quantile_bin(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    bin_expr: Dict[str, hl.expr.BooleanExpression] = {"bin": True},
    compute_snv_indel_separately: bool = True,
    n_bins: int = 100,
    k: int = 1000,
    desc: bool = True,
) -> hl.Table:
    """
    Returns a table with a bin for each row based on quantiles of `score_expr`.
    The bin is computed by dividing the `score_expr` into `n_bins` bins containing an equal number of elements.
    This is done based on quantiles computed with hl.agg.approx_quantiles. If a single value in `score_expr` spans more
    than one bin, the rows with this value are distributed randomly across the bins it spans.
    If `compute_snv_indel_separately` is True all items in `bin_expr` will be stratified by snv / indels for the bin
    calculation. Because SNV and indel rows are mutually exclusive, they are re-combined into a single annotation. For
    example if we have the following four variants and scores and `n_bins` of 2:
    ========   =======   ======   =================   =================
    Variant    Type      Score    bin - `compute_snv_indel_separately`:
    --------   -------   ------   -------------------------------------
    \          \         \        False               True
    ========   =======   ======   =================   =================
    Var1       SNV       0.1      1                   1
    Var2       SNV       0.2      1                   2
    Var3       Indel     0.3      2                   1
    Var4       Indel     0.4      2                   2
    ========   =======   ======   =================   =================
    .. note::
        The `bin_expr` defines which data the bin(s) should be computed on. E.g., to get a biallelic quantile bin and an
        singleton quantile bin, the following could be used:
        .. code-block:: python
            bin_expr={
                'biallelic_bin': ~ht.was_split,
                'singleton_bin': ht.singleton
            }
    :param ht: Input Table
    :param score_expr: Expression containing the score
    :param bin_expr: Quantile bin(s) to be computed (see notes)
    :param compute_snv_indel_separately: Should all `bin_expr` items be stratified by snv / indels
    :param n_bins: Number of bins to bin the data into
    :param k: The `k` parameter of approx_quantiles
    :param desc: Whether to bin the score in descending order
    :return: Table with the quantile bins
    """
    import math

    def quantiles_to_bin_boundaries(quantiles: List[float]) -> Dict:
        """
        Merges bins with the same boundaries into a unique bin while keeping track of
        which bins have been merged and the global index of all bins.
        :param quantiles: Original bin boundaries
        :return: A dict with the indices of collapsed bins mapped to the number of bins collapsed
                 (`merged_bins`), the global indices of the merged bins (`global_bin_indices`),
                 and the merged bin boundaries (`bin_boundaries`)
        """

        # Pad the quantiles to create boundaries for the first and last bins
        bin_boundaries = [-math.inf] + quantiles + [math.inf]
        merged_bins = defaultdict(int)

        # If every quantile has a unique value, then bin boundaries are unique
        # and can be passed to binary_search as-is
        if len(quantiles) == len(set(quantiles)):
            return dict(
                merged_bins=merged_bins,
                global_bin_indices=list(range(len(bin_boundaries))),
                bin_boundaries=bin_boundaries,
            )

        indexed_bins = list(enumerate(bin_boundaries))
        i = 1
        while i < len(indexed_bins):
            if indexed_bins[i - 1][1] == indexed_bins[i][1]:
                merged_bins[i - 1] += 1
                indexed_bins.pop(i)
            else:
                i += 1

        return dict(
            merged_bins=merged_bins,
            global_bin_indices=[x[0] for x in indexed_bins],
            bin_boundaries=[x[1] for x in indexed_bins],
        )

    if compute_snv_indel_separately:
        # For each bin, add a SNV / indel stratification
        bin_expr = {
            f"{bin_id}_{snv}": (expr & snv_expr)
            for bin_id, expr in bin_expr.items() for snv, snv_expr in [
                ("snv", hl.is_snp(ht.alleles[0], ht.alleles[1])),
                ("indel", ~hl.is_snp(ht.alleles[0], ht.alleles[1])),
            ]
        }

    bin_ht = ht.annotate(
        **{
            f"_filter_{bin_id}": bin_expr
            for bin_id, bin_expr in bin_expr.items()
        },
        _score=score_expr,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
    )
    logger.info(
        f"Adding quantile bins using approx_quantiles, binned into {n_bins} bins with k={k}"
    )
    bin_stats = bin_ht.aggregate(
        hl.struct(
            **{
                bin_id: hl.agg.filter(
                    bin_ht[f"_filter_{bin_id}"],
                    hl.struct(
                        n=hl.agg.count(),
                        quantiles=hl.agg.approx_quantiles(
                            bin_ht._score,
                            [x / (n_bins) for x in range(1, n_bins)],
                            k=k),
                    ),
                )
                for bin_id in bin_expr
            }))

    # Take care of bins with duplicated boundaries
    bin_stats = bin_stats.annotate(
        **{
            rname: bin_stats[rname].annotate(
                **quantiles_to_bin_boundaries(bin_stats[rname].quantiles))
            for rname in bin_stats
        })

    bin_ht = bin_ht.annotate_globals(bin_stats=hl.literal(
        bin_stats,
        dtype=hl.tstruct(
            **{
                bin_id: hl.tstruct(
                    n=hl.tint64,
                    quantiles=hl.tarray(hl.tfloat64),
                    bin_boundaries=hl.tarray(hl.tfloat64),
                    global_bin_indices=hl.tarray(hl.tint32),
                    merged_bins=hl.tdict(hl.tint32, hl.tint32),
                )
                for bin_id in bin_expr
            }),
    ))

    # Annotate the bin as the index in the unique boundaries array
    bin_ht = bin_ht.annotate(
        **{
            bin_id: hl.or_missing(
                bin_ht[f"_filter_{bin_id}"],
                hl.binary_search(bin_ht.bin_stats[bin_id].bin_boundaries,
                                 bin_ht._score),
            )
            for bin_id in bin_expr
        })

    # Convert the bin to global bin by expanding merged bins, that is:
    # If a value falls in a bin that needs expansion, assign it randomly to one of the expanded bins
    # Otherwise, simply modify the bin to its global index (with expanded bins that is)
    bin_ht = bin_ht.select(
        "snv",
        **{
            bin_id: hl.if_else(
                bin_ht.bin_stats[bin_id].merged_bins.contains(bin_ht[bin_id]),
                bin_ht.bin_stats[bin_id].global_bin_indices[bin_ht[bin_id]] +
                hl.int(
                    hl.rand_unif(
                        0, bin_ht.bin_stats[bin_id].merged_bins[bin_ht[bin_id]]
                        + 1)),
                bin_ht.bin_stats[bin_id].global_bin_indices[bin_ht[bin_id]],
            )
            for bin_id in bin_expr
        },
    )

    if desc:
        bin_ht = bin_ht.annotate(
            **{bin_id: n_bins - bin_ht[bin_id]
               for bin_id in bin_expr})

    # Because SNV and indel rows are mutually exclusive, re-combine them into a single bin.
    # Update the global bin_stats struct to reflect the change in bin names in the table
    if compute_snv_indel_separately:
        bin_expr_no_snv = {
            bin_id.rsplit("_", 1)[0]
            for bin_id in bin_ht.bin_stats
        }
        bin_ht = bin_ht.annotate_globals(bin_stats=hl.struct(
            **{
                bin_id: hl.struct(
                    **{
                        snv: bin_ht.bin_stats[f"{bin_id}_{snv}"]
                        for snv in ["snv", "indel"]
                    })
                for bin_id in bin_expr_no_snv
            }))

        bin_ht = bin_ht.transmute(
            **{
                bin_id: hl.if_else(
                    bin_ht.snv,
                    bin_ht[f"{bin_id}_snv"],
                    bin_ht[f"{bin_id}_indel"],
                )
                for bin_id in bin_expr_no_snv
            })

    return bin_ht
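
A hypothetical invocation mirroring the docstring example (`ht.score`, `ht.was_split`, and `ht.singleton` are placeholder fields, not part of the source):

binned_ht = compute_quantile_bin(
    ht,
    score_expr=ht.score,
    bin_expr={'biallelic_bin': ~ht.was_split, 'singleton_bin': ht.singleton},
    n_bins=100,
)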