Example #1
    def compute_same_hap_log_like(n, p, q, x):
        res = (
            hl.cond(
                q > 0,
                hl.fold(
                    lambda i, j: i + j[0] * j[1], 0.0,
                    hl.zip(gt_counts, [
                        hl.log10(x) * 2,
                        hl.log10(2 * x * e),
                        hl.log10(e) * 2,
                        hl.log10(2 * x * p),
                        hl.log10(2 * (p * e + x * q)),
                        hl.log10(2 * q * e),
                        hl.log10(p) * 2,
                        hl.log10(2 * p * q),
                        hl.log10(q) * 2
                    ])),
                -1e31  # Very large negative value if no q is present
            ))

        # If desired, add distance posterior based on value derived from regression
        if distance is not None:
            res = res + hl.max(-6,
                               hl.log10(0.97 - 0.03 * hl.log(distance + 1)))

        return res
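The fold-over-zip idiom above computes a dot product in log space: each genotype count is multiplied by its class log-probability and the products are summed (`gt_counts`, `e`, and `distance` come from the enclosing scope, not shown here). A minimal self-contained sketch of the idiom, with made-up numbers:

    import hail as hl

    counts = hl.literal([4.0, 2.0, 1.0])        # hypothetical genotype counts
    log_probs = hl.literal([-0.1, -0.5, -2.0])  # hypothetical per-class log10-probabilities

    # Sum of elementwise products via a fold over the zipped pairs.
    log_like = hl.fold(lambda acc, pair: acc + pair[0] * pair[1], 0.0,
                       hl.zip(counts, log_probs))
    print(hl.eval(log_like))  # approximately -3.4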
Example #2
def vep_protein_domain_ann_expr(
        s: hl.expr.StringExpression) -> hl.expr.DictExpression:
    """
    Parse and annotate protein domain(s) from VEP annotation.
    Expects a StringExpression as input (e.g. 'Pfam:PF13853&Prints:PR00237&PROSITE_profiles:PS50262')
    and generates a dict<k,v> where the keys (k) represent the source/database and the values (v) the annotated domain_id.

    :param s: hl.expr.StringExpression
    :return: hl.expr.DictExpression
    """
    a1 = s.split(delim="&")

    # keep only well-annotated domain(s) (i.e. <source:domain_id>)
    a2 = a1.map(lambda x: x.split(delim=":"))
    a2 = a2.filter(lambda x: x.length() == 2)

    d = (hl.case()
         .when(hl.len(a2) > 0,
               hl.dict(hl.zip(
                   a2.map(lambda x: x[0]),  # TODO: Optimize by scanning the array just once.
                   a2.map(lambda x: x[1]))))
         .or_missing())

    return d
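A hedged usage sketch of the function above, using the sample string from its docstring (assumes an initialized Hail session):

    import hail as hl

    s = hl.str('Pfam:PF13853&Prints:PR00237&PROSITE_profiles:PS50262')
    print(hl.eval(vep_protein_domain_ann_expr(s)))
    # {'Pfam': 'PF13853', 'Prints': 'PR00237', 'PROSITE_profiles': 'PS50262'}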
Example #3
def add_popmax_expr(freq: hl.expr.ArrayExpression,
                    freq_meta: hl.expr.ArrayExpression,
                    populations: Set[str]) -> hl.expr.ArrayExpression:
    """
    Calculates popmax (add an additional entry into freq with popmax: pop)

    :param ArrayExpression freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param ArrayExpression freq_meta: ArrayExpression of meta dictionaries corresponding to freq
    :param set of str populations: Set of populations over which to calculate popmax
    :return: Frequency data with annotated popmax
    :rtype: ArrayExpression
    """
    pops_to_use = hl.literal(populations)
    freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(freq, freq_meta))
    freq_filtered = hl.filter(
        lambda f: (f.meta.size() == 2) & (f.meta.get('group') == 'adj') &
        pops_to_use.contains(f.meta.get('pop')) & (f.AC > 0), freq)
    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(
        hl.len(sorted_freqs) > 0,
        hl.struct(AC=sorted_freqs[0].AC,
                  AF=sorted_freqs[0].AF,
                  AN=sorted_freqs[0].AN,
                  homozygote_count=sorted_freqs[0].homozygote_count,
                  pop=sorted_freqs[0].meta['pop']))
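A hedged sketch of the annotate-via-zip step used above, pairing each frequency struct with its metadata and picking the highest-AF population (all values made up):

    import hail as hl

    freq = hl.array([hl.struct(AC=5, AF=0.1, AN=50, homozygote_count=1),
                     hl.struct(AC=2, AF=0.4, AN=5, homozygote_count=0)])
    freq_meta = hl.literal([{'group': 'adj', 'pop': 'nfe'},
                            {'group': 'adj', 'pop': 'afr'}])

    # Attach each meta dict to its corresponding frequency struct.
    annotated = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(freq, freq_meta))
    # Population with the highest AF:
    print(hl.eval(hl.sorted(annotated, key=lambda f: f.AF, reverse=True)[0].meta['pop']))
    # 'afr'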
Example #4
def to_dense_mt(vds: 'VariantDataset') -> 'MatrixTable':
    """Creates a single, dense :class:`.MatrixTable` from the split
    :class:`.VariantDataset` representation.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset in dense MatrixTable representation.
    """
    ref = vds.reference_data
    ref = ref.drop(*(x for x in ('alleles', 'rsid') if x in ref.row))
    var = vds.variant_data
    refl = ref.localize_entries('_ref_entries')
    varl = var.localize_entries('_var_entries', '_var_cols')
    varl = varl.annotate(_variant_defined=True)
    joined = refl.join(varl.key_by('locus'), how='outer')
    dr = joined.annotate(dense_ref=hl.or_missing(
        joined._variant_defined,
        hl.scan._densify(hl.len(joined._var_cols), joined._ref_entries)))
    dr = dr.filter(dr._variant_defined)

    def coalesce_join(ref, var):

        call_field = 'GT' if 'GT' in var else 'LGT'
        assert call_field in var, var.dtype

        shared_fields = [call_field] + list(
            f for f in ref.dtype if f in var.dtype)
        shared_field_set = set(shared_fields)
        var_fields = [f for f in var.dtype if f not in shared_field_set]

        return hl.if_else(
            hl.is_defined(var), var.select(*shared_fields, *var_fields),
            ref.annotate(**{
                call_field: hl.call(0, 0)
            }).select(*shared_fields,
                      **{f: hl.null(var[f].dtype)
                         for f in var_fields}))

    dr = dr.annotate(_dense=hl.zip(dr._var_entries, dr.dense_ref).map(
        lambda tup: coalesce_join(
            hl.or_missing(tup[1].END > dr.locus.position, tup[1]), tup[0])))

    dr = dr._key_by_assert_sorted('locus', 'alleles')
    dr = dr.drop('_var_entries', '_ref_entries', 'dense_ref',
                 '_variant_defined', 'ref_allele')
    return dr._unlocalize_entries('_dense', '_var_cols', list(var.col_key))
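A hedged usage sketch of the function above (the VDS path is hypothetical):

    import hail as hl

    vds = hl.vds.read_vds('gs://my-bucket/dataset.vds')  # hypothetical path
    dense_mt = hl.vds.to_dense_mt(vds)
    dense_mt.describe()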
Example #5
def post_process_gene_map_ht(gene_ht):
    groups = [
        'pLoF', 'missense|LC', 'pLoF|missense|LC', 'synonymous', 'missense'
    ]
    variant_groups = hl.map(
        lambda group: group.split('\\|').flatmap(
            lambda csq: gene_ht.variants.get(csq)),
        groups)
    gene_ht = gene_ht.transmute(variant_groups=hl.zip(
        groups, variant_groups)).explode('variant_groups')
    gene_ht = gene_ht.transmute(annotation=gene_ht.variant_groups[0],
                                variants=hl.sorted(gene_ht.variant_groups[1]))
    gene_ht = gene_ht.key_by(start=gene_ht.interval.start)
    return gene_ht.filter(hl.len(gene_ht.variants) > 0)
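The zip-then-explode idiom above produces one row per (group, variants) pair from two parallel arrays. A minimal self-contained sketch of the idiom (group names and variant ids are made up):

    import hail as hl

    ht = hl.utils.range_table(1)
    ht = ht.annotate(groups=hl.zip(hl.literal(['pLoF', 'synonymous']),
                                   hl.literal([['v1', 'v2'], ['v3']])))
    ht = ht.explode('groups')
    ht = ht.transmute(annotation=ht.groups[0], variants=ht.groups[1])
    ht.show()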
Example #6
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Example #7
    def compute_chet_log_like(n, p, q, x):
        res = (hl.cond((p > 0) & (q > 0),
                       hl.fold(
                           lambda i, j: i + j[0] * j[1], 0.0,
                           hl.zip(gt_counts, [
                               hl.log10(x) * 2,
                               hl.log10(2 * x * q),
                               hl.log10(q) * 2,
                               hl.log10(2 * x * p),
                               hl.log10(2 * (p * q + x * e)),
                               hl.log10(2 * q * e),
                               hl.log10(p) * 2,
                               hl.log10(2 * p * e),
                               hl.log10(e) * 2
                           ])), -1e31))  # very large negative value when p or q is zero
        # If desired, add distance posterior based on value derived from regression
        if distance is not None:
            res = res + hl.max(-6,
                               hl.log10(0.03 + 0.03 * hl.log(distance - 1)))

        return res
Example #8
def combine_phenotypes_with_name(mt: hl.MatrixTable,
                                 column_field,
                                 entry_field,
                                 dict_of_columns,
                                 new_col_name='grouping',
                                 new_entry_name='new_entry',
                                 grouping_function=hl.agg.any):
    """
    Group by non-unique fields and apply grouping_function in order to combine entries in MatrixTable.

    Example:

    mt = hl.balding_nichols_model(1, 4, 10)
    mt = mt.annotate_entries(pheno=hl.rand_bool(0.5))
    dict_of_columns = {'pheno01': [0, 1], 'pheno03': [0, 3]}
    entry_field = mt.pheno
    column_field = mt.sample_idx

    :param MatrixTable mt: Input MatrixTable
    :param Expression column_field: Column-indexed Expression to group by
    :param Expression entry_field: Entry-indexed Expression to which to apply `grouping_function`
    :param dict of any -> list dict_of_columns: Entries in the lists should be the same type as `column_field`
    :param str new_col_name: Name for new column key (default 'grouping')
    :param str new_entry_name: Name for new entry expression (default 'new_entry')
    :param function grouping_function: Aggregator function to apply to `entry_field` (default hl.agg.any)
    :return: Re-grouped MatrixTable
    :rtype: MatrixTable
    """
    dict_of_columns = hl.literal(dict_of_columns)
    mt = mt._annotate_all(col_exprs={'_col_expr': column_field},
                          entry_exprs={'_entry_expr': entry_field})
    mt = mt.annotate_cols(
        **{
            new_col_name:
            hl.zip(dict_of_columns.keys(), dict_of_columns.values()).filter(
                lambda x: x[1].contains(mt._col_expr)).map(lambda x: x[0])
        })
    mt = mt.explode_cols(new_col_name)
    return mt.group_cols_by(new_col_name).aggregate(
        **{new_entry_name: grouping_function(mt._entry_expr)})
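Completing the docstring's example with a hedged call (output names follow the defaults):

    import hail as hl

    mt = hl.balding_nichols_model(1, 4, 10)
    mt = mt.annotate_entries(pheno=hl.rand_bool(0.5))
    dict_of_columns = {'pheno01': [0, 1], 'pheno03': [0, 3]}
    out = combine_phenotypes_with_name(mt, mt.sample_idx, mt.pheno, dict_of_columns)
    out.describe()  # columns keyed by 'grouping'; entries named 'new_entry'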
Example #9
    def merge_arrays(r_array, v_array):

        def rewrite_ref(r):
            ref_block_selector = {}
            for k, t in merged_schema.items():
                if k == 'LA':
                    ref_block_selector[k] = hl.literal([0])
                elif k in ('LGT', 'GT'):
                    ref_block_selector[k] = hl.call(0, 0)
                else:
                    ref_block_selector[k] = r[k] if k in r else hl.missing(t)
            return r.select(**ref_block_selector)

        def rewrite_var(v):
            return v.select(**{
                k: v[k] if k in v else hl.missing(t)
                for k, t in merged_schema.items()
            })

        return hl.case() \
            .when(hl.is_missing(r_array), v_array.map(rewrite_var)) \
            .when(hl.is_missing(v_array), r_array.map(rewrite_ref)) \
            .default(hl.zip(r_array, v_array).map(lambda t: hl.coalesce(rewrite_var(t[1]), rewrite_ref(t[0]))))
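The final branch above coalesces paired variant and reference entries positionally. A minimal self-contained sketch of that coalesce-over-zip idiom (hypothetical arrays):

    import hail as hl

    a = hl.array([1, hl.missing('int32'), 3])
    b = hl.array([hl.missing('int32'), 20, 30])

    # For each position, take the first non-missing value.
    merged = hl.zip(a, b).map(lambda t: hl.coalesce(t[0], t[1]))
    print(hl.eval(merged))  # [1, 20, 3]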
Example #10
def main(args):
    # Init Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    # vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=get_vep_vqsr_vcf_path(), force_bgz=args.force_bgz)

    # get the annotated VEP field names from the VCF header
    vep_fields = get_vep_fields(vcf_path=get_vep_vqsr_vcf_path(),
                                vep_csq_field=args.csq_field)

    if args.split_multi_allelic:
        # split multi-allelic variants
        mt = hl.split_multi_hts(mt)

        # split/annotate fields in the info field (using the allele index)
        mt = mt.annotate_rows(info=mt.info.annotate(
            **{field: mt.info[field][mt.a_index - 1]
               for field in INFO_FIELDS}))

    # parse/annotate the CSQ field in a different structure
    tb_csq = mt.rows()
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.info[args.csq_field]))

    # Convert/annotate all transcripts per variant with a structure of type array<dict<str, str>>.
    # Each transcript is represented as a dict<k,v>, where the keys are the field names extracted
    # from the VCF header and the values are the currently annotated values in the CSQ field.
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.csq_raw.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]'))))))

    # Keep transcript(s) matching the allele index (only used if variants were split with split_multi_hts).
    # It requires the flag "ALLELE_NUM" to be annotated by VEP.
    # Apply only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    if all(x in list(tb_csq._fields.keys()) for x in ['was_split', 'a_index']):
        tb_csq = tb_csq.annotate(csq_raw=hl.cond(
            tb_csq.was_split,
            tb_csq.csq_raw.filter(
                lambda x: hl.int(x["ALLELE_NUM"]) == tb_csq.a_index),
            tb_csq.csq_raw))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(
        ht=tb_csq,
        csq_array='csq_raw',
    )

    # Expand selected transcript (dict) annotations adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx', output_filed='vep')

    # Parse the "Consequence" field. Keep only the first (most severe) consequence,
    # avoiding the notation "consequence_1&consequence_2".
    tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
        Consequence=tb_csq.vep.Consequence.split('&')[0])))

    # Parse the protein DOMAIN field
    if 'DOMAINS' in vep_fields:
        tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
            DOMAINS=vep_protein_domain_ann_expr(tb_csq.vep['DOMAINS']))))

    # drop redundant/temp fields
    tb_csq = (tb_csq.drop('csq_raw', 'tx').repartition(500))

    # print fields overview
    tb_csq.describe()

    # write table as HailTable to disk
    # (tb_csq
    # .write(output=args.tb_output_path,
    #        overwrite=args.overwrite)
    # )

    output_path = get_variant_qc_ht_path(part='vep_vqsr',
                                         split=args.split_multi_allelic)
    tb_csq = (tb_csq.checkpoint(output=output_path, overwrite=args.overwrite))

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        (tb_csq.export(f'{output_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()
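A hedged sketch of the CSQ-parsing idiom used in the pipeline above, turning one transcript annotation into a field-name -> value dict (the field names are a hypothetical subset of a real CSQ header):

    import hail as hl

    vep_fields = ['Allele', 'Consequence', 'IMPACT']
    csq = hl.str('A|missense_variant|MODERATE')
    d = hl.dict(hl.zip(hl.literal(vep_fields), csq.split('[|]')))
    print(hl.eval(d))
    # {'Allele': 'A', 'Consequence': 'missense_variant', 'IMPACT': 'MODERATE'}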
Example #11
def _to_expr(e, dtype):
    if e is None:
        return None
    elif isinstance(e, Expression):
        if e.dtype != dtype:
            assert is_numeric(dtype), 'expected {}, got {}'.format(
                dtype, e.dtype)
            if dtype == tfloat64:
                return hl.float64(e)
            elif dtype == tfloat32:
                return hl.float32(e)
            elif dtype == tint64:
                return hl.int64(e)
            else:
                assert dtype == tint32
                return hl.int32(e)
        return e
    elif not is_compound(dtype):
        # these are not container types and cannot contain expressions if we got here
        return e
    elif isinstance(dtype, tstruct):
        new_fields = []
        found_expr = False
        for f, t in dtype.items():
            value = _to_expr(e[f], t)
            found_expr = found_expr or isinstance(value, Expression)
            new_fields.append(value)

        if not found_expr:
            return e
        else:
            exprs = [
                new_fields[i] if isinstance(new_fields[i], Expression) else
                hl.literal(new_fields[i], dtype[i])
                for i in range(len(new_fields))
            ]
            fields = {name: expr for name, expr in zip(dtype.keys(), exprs)}
            from .typed_expressions import StructExpression
            return StructExpression._from_fields(fields)

    elif isinstance(dtype, tarray):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [
                element if isinstance(element, Expression) else hl.literal(
                    element, dtype.element_type) for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeArray([e._ir for e in exprs], None)
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tset):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [
                element if isinstance(element, Expression) else hl.literal(
                    element, dtype.element_type) for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.ToSet(
                ir.ToStream(ir.MakeArray([e._ir for e in exprs], None)))
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, ttuple):
        elements = []
        found_expr = False
        assert len(e) == len(dtype.types)
        for i in range(len(e)):
            value = _to_expr(e[i], dtype.types[i])
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            exprs = [
                elements[i] if isinstance(elements[i], Expression) else
                hl.literal(elements[i], dtype.types[i])
                for i in range(len(elements))
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeTuple([expr._ir for expr in exprs])
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tdict):
        keys = []
        values = []
        found_expr = False
        for k, v in e.items():
            k_ = _to_expr(k, dtype.key_type)
            v_ = _to_expr(v, dtype.value_type)
            found_expr = found_expr or isinstance(k_, Expression)
            found_expr = found_expr or isinstance(v_, Expression)
            keys.append(k_)
            values.append(v_)
        if not found_expr:
            return e
        else:
            assert len(keys) > 0
            # Use `to_expr` to convert the keys and values to expressions separately;
            # a common mode is statically-known keys paired with Expression values.
            key_array = to_expr(keys, tarray(dtype.key_type))
            value_array = to_expr(values, tarray(dtype.value_type))
            return hl.dict(hl.zip(key_array, value_array))
    elif isinstance(dtype, hl.tndarray):
        return hl.nd.array(e)
    else:
        raise NotImplementedError(dtype)
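A hedged illustration of what this conversion enables at the public API level: a Python list containing one Expression is routed through the tarray branch above and becomes a single ArrayExpression:

    import hail as hl

    mixed = [1, 2, hl.int32(3)]  # Python ints plus one Expression
    arr = hl.array(mixed)        # conversion goes through _to_expr
    print(arr.dtype)             # array<int32>
    print(hl.eval(arr))          # [1, 2, 3]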
Example #12
def _blanczos_pca(entry_expr,
                  k=10,
                  compute_loadings=False,
                  q_iterations=2,
                  oversampling_param=2,
                  block_size=128):
    r"""Run randomized principal component analysis approximation (PCA)
    on numeric columns derived from a matrix table.

    Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                 k=2)

    Warning
    -------
      This method does **not** automatically mean-center or normalize each column.
      If desired, such transformations should be incorporated in `entry_expr`.

      Hail will return an error if `entry_expr` evaluates to missing, nan, or
      infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the rows
    of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2,
    \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`.
    Typically one computes only the first :math:`k` singular vectors and values,
    yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of
    :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are
    :math:`n \times k`, :math:`k \times k` and :math:`m \times k`
    respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the Gramian
    :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2,
    \ldots`, which represent the variances carried by the respective PCs. By
    default, Hail only computes the loadings if the ``compute_loadings``
    parameter is ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing
    the principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular values.
        Typically chosen so that ``0 <= oversampling_param <= k``.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """
    check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr)
    mt = matrix_table_source('pca/entry_expr', entry_expr)

    A, ht = mt_to_table_of_ndarray(entry_expr,
                                   block_size,
                                   return_checkpointed_table_also=True)
    A = A.persist()

    # Set Parameters

    q = q_iterations
    L = k + oversampling_param
    n = A.take(1)[0].ndarray.shape[1]

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))

    def hailBlanczos(A, G, k, q):

        h_list = []
        G_i = hl.nd.qr(G)[0]

        for j in range(0, q):
            info(f"blanczos_pca: Beginning iteration {j + 1}/{q + 1}")
            temp = A.annotate(H_i=A.ndarray @ G_i)
            temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
            result = temp.aggregate(hl.struct(
                Hi_chunks=hl.agg.collect(temp.H_i),
                G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
                                    _localize=False)._persist()
            localized_H_i = hl.nd.vstack(result.Hi_chunks)
            h_list.append(localized_H_i)
            G_i = hl.nd.qr(result.G_i)[0]

        info(f"blanczos_pca: Beginning iteration {q + 1}/{q + 1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        result = temp.aggregate(hl.agg.collect(temp.H_i),
                                _localize=False)._persist()
        info("blanczos_pca: Iterations complete. Computing local QR")
        localized_H_i = hl.nd.vstack(result)
        h_list.append(localized_H_i)
        H = hl.nd.hstack(h_list)
        Q = hl.nd.qr(H)[0]._persist()
        A = A.annotate(part_size=A.ndarray.shape[0])
        A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size)))
        A = A.annotate_globals(Qt=Q.T)
        T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding +
                                    A.part_size] @ A.ndarray)
        arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

        info("blanczos_pca: QR Complete. Computing local SVD")
        U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

        V = Q @ U

        truncV = V[:, :k]
        truncS = S[:k]
        truncW = W[:k, :]

        return truncV, truncS, truncW

    U, S, V = hailBlanczos(A, G, k, q)

    scores = V.transpose() * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key))

    lt = ht.select()
    lt = lt.annotate_globals(U=U)
    idx_name = '_tmp_pca_loading_index'
    lt = lt.add_index(idx_name)
    lt = lt.annotate(
        loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
    lt = lt.drop(lt[idx_name])

    if compute_loadings:
        return eigens, st, lt
    else:
        return eigens, st, None
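A hedged usage sketch on simulated genotypes (`balding_nichols_model` is public Hail API; the parameter values are arbitrary):

    import hail as hl

    mt = hl.balding_nichols_model(n_populations=3, n_samples=100, n_variants=1000)
    eigenvalues, scores_ht, loadings_ht = hl._blanczos_pca(
        hl.float64(mt.GT.n_alt_alleles()), k=5, compute_loadings=True)
    scores_ht.show(5)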
Example #13
def main(args):
    # Init Hail with hg38 genome build as default
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=vcf_path, force_bgz=args.force_bgz)

    # get the annotated VEP field names from the VCF header
    vep_fields = get_vep_fields(vcf_path=vcf_path,
                                vep_csq_field=args.csq_field)

    if args.exclude_multi_allelic:
        # TODO: This option should skip the split_multi step...
        # Filter out multi-allelic variants. Keep only bi-allelic
        mt = filter_biallelic(mt)

    # split multi-allelic variants
    mt = hl.split_multi_hts(mt)

    # flatten the nested structure (e.g. 'info') and get a Table with all row fields
    tb_csq = (mt.rows().flatten().key_by('locus', 'alleles'))

    # Rename the info[CSQ] field to 'csq_array'.
    # Simpler field names are easier to reference later.
    tb_csq = (tb_csq.rename({'info.' + args.csq_field: 'csq_array'}))

    # Convert/annotate all transcripts per variant with a structure of type array<dict<str, str>>.
    # Each transcript is represented as a dict<k,v>, where the keys are the field names extracted
    # from the VCF header and the values are the currently annotated values in the CSQ field.
    tb_csq = (tb_csq.annotate(csq_array=tb_csq.csq_array.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]'))))))

    # Keep transcript(s) matching the allele index.
    # It requires the flag "ALLELE_NUM" to be annotated by VEP.
    # Apply only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    tb_csq = tb_csq.annotate(csq_array=hl.cond(
        tb_csq.was_split,
        tb_csq.csq_array.filter(
            lambda x: hl.int(x["ALLELE_NUM"]) == tb_csq.a_index),
        tb_csq.csq_array))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(ht=tb_csq, csq_array='csq_array')

    # Expand selected transcript (dict) annotations adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx')

    # Parse the "Consequence" field. Keep only the first (most severe) consequence,
    # avoiding the notation "consequence_1&consequence_2".
    tb_csq = (tb_csq.transmute(Consequence=tb_csq.Consequence.split('&')[0]))

    # print fields overview
    tb_csq.describe()

    # drop unnecessary fields
    tb_csq = (tb_csq.drop('csq_array', 'tx'))

    # write table as HailTable to disk
    (tb_csq.write(output=args.tb_output_path))

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        (tb_csq.export(args.tb_output_path + '.tsv.bgz'))

    # Stop Hail
    hl.stop()
Example #14
def _blanczos_pca(A,
                  k=10,
                  compute_loadings=False,
                  q_iterations=2,
                  oversampling_param=2,
                  block_size=128):
    r"""Run randomized principal component analysis approximation (PCA)
    on numeric columns derived from a matrix table.

    Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                 k=2)

    Warning
    -------
      This method does **not** automatically mean-center or normalize each column.
      If desired, such transformations should be incorporated in `entry_expr`.

      Hail will return an error if `entry_expr` evaluates to missing, nan, or
      infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the rows
    of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2,
    \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`.
    Typically one computes only the first :math:`k` singular vectors and values,
    yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of
    :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are
    :math:`n \times k`, :math:`k \times k` and :math:`m \times k`
    respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the Gramian
    :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2,
    \ldots`, which represent the variances carried by the respective PCs. By
    default, Hail only computes the loadings if the ``compute_loadings``
    parameter is ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing
    the principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    A : :class:`.Expression` or TallSkinnyMatrix
        Numeric expression for matrix entries, or an already-constructed
        TallSkinnyMatrix block representation.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular values.
        Typically chosen so that ``0 <= oversampling_param <= k``.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_blanczos_pca/entry_expr', A)
        A = _make_tsm(A, block_size)

    U, S, V = _reduced_svd(A, k, compute_loadings, q_iterations,
                           k + oversampling_param)

    scores = V * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.source_table.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=A.col_key)

    if compute_loadings:
        lt = A.source_table.select()
        lt = lt.annotate_globals(U=U)
        idx_name = '_tmp_pca_loading_index'
        lt = lt.add_index(idx_name)
        lt = lt.annotate(
            loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
        lt = lt.drop(lt[idx_name])
        return eigens, st, lt
    else:
        return eigens, st, None
Example #15
def _pca_and_moments(A,
                     k=10,
                     num_moments=5,
                     compute_loadings=False,
                     q_iterations=2,
                     oversampling_param=2,
                     block_size=128,
                     moment_samples=100):
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    # Set Parameters
    q = q_iterations
    L = k + oversampling_param
    n = A.ncols

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_loadings)
    info("_reduced_svd: Computing local SVD")
    U, S, V = fact.reduced_svd(k)

    p = min(num_moments // 2, 10)

    # Generate random matrix G2 for moment estimation
    G2 = hl.nd.zeros((n, moment_samples)).map(
        lambda n: hl.if_else(hl.rand_bool(0.5), -1, 1))
    # Project out components in subspace fact.V, which we can compute exactly
    G2 = G2 - fact.V @ (fact.V.T @ G2)
    Q1, R1 = hl.nd.qr(G2)._persist()
    fact2 = _krylov_factorization(A, Q1, p, compute_U=False)
    moments_and_stdevs = fact2.spectral_moments(num_moments, R1)
    # Add back exact moments
    moments = moments_and_stdevs.moments + hl.nd.array([
        fact.S.map(lambda x: x**(2 * i)).sum()
        for i in range(1, num_moments + 1)
    ])
    moments_and_stdevs = hl.eval(
        hl.struct(moments=moments, stdevs=moments_and_stdevs.stdevs))
    moments = moments_and_stdevs.moments
    stdevs = moments_and_stdevs.stdevs

    scores = V * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.source_table.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=A.col_key)

    if compute_loadings:
        lt = A.source_table.select()
        lt = lt.annotate_globals(U=U)
        idx_name = '_tmp_pca_loading_index'
        lt = lt.add_index(idx_name)
        lt = lt.annotate(
            loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
        lt = lt.drop(lt[idx_name])
    else:
        lt = None

    return eigens, st, lt, moments, stdevs
Example #16
def test_to_dense_mt():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_2samples_starts.vds'))
    vds = hl.vds.filter_chromosomes(vds, keep='chr22')

    dense = hl.vds.to_dense_mt(vds).select_entries('LGT', 'LA', 'GQ', 'DP')

    assert dense.rows().select()._same(
        vds.variant_data.rows().select()), "rows differ between variant data and dense mt"

    assert dense.filter_entries(hl.is_defined(dense.LA))._same(
        vds.variant_data.select_entries('LGT', 'LA', 'GQ',
                                        'DP')), "cannot recover variant data"

    as_dict = dense.aggregate_entries(
        hl.dict(
            hl.zip(hl.agg.collect((hl.str(dense.locus), dense.s)),
                   hl.agg.collect(dense.entry))))

    assert as_dict.get(('chr22:10514784', 'NA12891')) is None
    assert as_dict.get(
        ('chr22:10514784', 'NA12878')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=23,
                                                    DP=4)

    assert as_dict.get(
        ('chr22:10516150', 'NA12891')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=64,
                                                    DP=4)
    assert as_dict.get(
        ('chr22:10516150', 'NA12878')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=99,
                                                    DP=10)

    assert as_dict.get(
        ('chr22:10519088', 'NA12891')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=99,
                                                    DP=21)
    assert as_dict.get(('chr22:10519088', 'NA12878')) is None

    assert as_dict.get(
        ('chr22:10562435', 'NA12891')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=99,
                                                    DP=15)
    assert as_dict.get(
        ('chr22:10562435', 'NA12878')) == hl.Struct(LGT=hl.Call([0, 0]),
                                                    LA=None,
                                                    GQ=21,
                                                    DP=9)

    assert as_dict.get(
        ('chr22:10562436', 'NA12891')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=99,
                                                    DP=15)
    assert as_dict.get(
        ('chr22:10562436', 'NA12878')) == hl.Struct(LGT=hl.Call([0, 0]),
                                                    LA=None,
                                                    GQ=21,
                                                    DP=9)