Exemple #1
0
def _make_tsm(entry_expr, block_size):
    mt = matrix_table_source('_make_tsm/entry_expr', entry_expr)
    A, ht = mt_to_table_of_ndarray(entry_expr,
                                   block_size,
                                   return_checkpointed_table_also=True)
    A = A.persist()
    return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
Exemple #2
0
def _make_tsm_from_call(call_expr,
                        block_size,
                        mean_center=False,
                        hwe_normalize=False):
    mt = matrix_table_source('_make_tsm/entry_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    if mean_center or hwe_normalize:
        mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                              __n_called=agg.count_where(hl.is_defined(
                                  mt.__gt)))
        mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

        n_variants = mt.count_rows()
        if n_variants == 0:
            raise FatalError(
                "_make_tsm: found 0 variants after filtering out monomorphic sites."
            )
        info(
            f"_make_tsm: found {n_variants} variants after filtering out monomorphic sites."
        )

        mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
        mt = mt.unfilter_entries()

        mt = mt.select_entries(__x=hl.or_else(mt.__gt - mt.__mean_gt, 0.0))

        if hwe_normalize:
            mt = mt.annotate_rows(
                __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt *
                                             (2 - mt.__mean_gt) * n_variants /
                                             2))
            mt = mt.select_entries(__x=mt.__x / mt.__hwe_scaled_std_dev)
    else:
        mt = mt.select_entries(__x=mt.__gt)

    A, ht = mt_to_table_of_ndarray(mt.__x,
                                   block_size,
                                   return_checkpointed_table_also=True)
    A = A.persist()
    return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
Exemple #3
0
def _blanczos_pca(entry_expr,
                  k=10,
                  compute_loadings=False,
                  q_iterations=2,
                  oversampling_param=2,
                  block_size=128):
    r"""Run randomized principal component analysis approximation (PCA)
    on numeric columns derived from a matrix table.

    Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                 k=2)

    Warning
    -------
      This method does **not** automatically mean-center or normalize each column.
      If desired, such transformations should be incorporated in `entry_expr`.

      Hail will return an error if `entry_expr` evaluates to missing, nan, or
      infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the rows
    of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2,
    \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`.
    Typically one computes only the first :math:`k` singular vectors and values,
    yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of
    :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are
    :math:`n \times k`, :math:`k \times k` and :math:`m \times k`
    respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the Gramian
    :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2,
    \ldots`, which represent the variances carried by the respective PCs. By
    default, Hail only computes the loadings if the ``loadings`` parameter is
    specified.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing
    the principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular values.
        Usually a value between `0 <= oversampling_param <= k`.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """
    check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr)
    mt = matrix_table_source('pca/entry_expr', entry_expr)

    A, ht = mt_to_table_of_ndarray(entry_expr,
                                   block_size,
                                   return_checkpointed_table_also=True)
    A = A.persist()

    # Set Parameters

    q = q_iterations
    L = k + oversampling_param
    n = A.take(1)[0].ndarray.shape[1]

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))

    def hailBlanczos(A, G, k, q):

        h_list = []
        G_i = hl.nd.qr(G)[0]

        for j in range(0, q):
            info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}")
            temp = A.annotate(H_i=A.ndarray @ G_i)
            temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
            result = temp.aggregate(hl.struct(
                Hi_chunks=hl.agg.collect(temp.H_i),
                G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
                                    _localize=False)._persist()
            localized_H_i = hl.nd.vstack(result.Hi_chunks)
            h_list.append(localized_H_i)
            G_i = hl.nd.qr(result.G_i)[0]

        info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        result = temp.aggregate(hl.agg.collect(temp.H_i),
                                _localize=False)._persist()
        info("blanczos_pca: Iterations complete. Computing local QR")
        localized_H_i = hl.nd.vstack(result)
        h_list.append(localized_H_i)
        H = hl.nd.hstack(h_list)
        Q = hl.nd.qr(H)[0]._persist()
        A = A.annotate(part_size=A.ndarray.shape[0])
        A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size)))
        A = A.annotate_globals(Qt=Q.T)
        T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding +
                                    A.part_size] @ A.ndarray)
        arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

        info("blanczos_pca: QR Complete. Computing local SVD")
        U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

        V = Q @ U

        truncV = V[:, :k]
        truncS = S[:k]
        truncW = W[:k, :]

        return truncV, truncS, truncW

    U, S, V = hailBlanczos(A, G, k, q)

    scores = V.transpose() * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key))

    lt = ht.select()
    lt = lt.annotate_globals(U=U)
    idx_name = '_tmp_pca_loading_index'
    lt = lt.add_index(idx_name)
    lt = lt.annotate(
        loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
    lt = lt.drop(lt[idx_name])

    if compute_loadings:
        return eigens, st, lt
    else:
        return eigens, st, None