Beispiel #1
0
def correlation_reduce_cpu(v: ArrayLike,
                           out: ArrayLike) -> None:  # pragma: no cover
    """Combine per-chunk partial sums into a correlation distance.

    Parameters
    ----------
    v
        Array of per-chunk partial statistics. It is summed over axis 0 and
        entries 0..5 of the summed result are used.
    out
        Output buffer; only ``out[0]`` is written.

    Returns
    -------
    None. ``out[0]`` receives ``1 - r``, where ``r`` is the correlation
    coefficient derived from the accumulated statistics, or NaN when the
    denominator is not strictly positive.
    """
    totals = v.sum(axis=0)
    # Layout as used by the formula below (presumably produced by the
    # matching "map" step -- confirm there):
    #   [0], [1] sums of the two vectors, [2], [3] sums of their squares,
    #   [4] sum of their products, [5] number of observations.
    count = totals[5]
    covariance_term = count * totals[4] - totals[0] * totals[1]
    spread_first = np.sqrt(count * totals[2] - totals[0]**2)
    spread_second = np.sqrt(count * totals[3] - totals[1]**2)
    scale = spread_first * spread_second
    if scale > 0:
        out[0] = 1 - (covariance_term / scale)
    else:
        # Zero or NaN denominator: the correlation is undefined.
        out[0] = np.nan
Beispiel #2
0
 def _aggregate(x_chunk: ArrayLike, **_: typing.Any) -> ArrayLike:
     """Final step of the dask reduction, producing the output value.

     It runs last when the dask graph is resolved and is always invoked,
     even when the reduced Array has only a single chunk along the reduced
     axes. Extra keyword arguments are accepted and ignored.
     """
     leading_shape = x_chunk.shape[:-2]
     # Fold the two trailing axes into (rows, n_map_param) so that each row
     # holds one set of mapped statistics for the reduce function.
     stacked = x_chunk.reshape(leading_shape + (-1, n_map_param))
     reduced: ArrayLike = metric_reduce_func(stacked)
     return reduced
Beispiel #3
0
def _Garud_h_cohorts(gt: ArrayLike, sample_cohort: ArrayLike, n_cohorts: int,
                     ct: ArrayLike) -> ArrayLike:
    """Compute the Garud H statistics for each requested cohort.

    Parameters
    ----------
    gt
        Array that is transposed before hashing so that its columns
        (haplotypes) are hashed.
    sample_cohort
        Cohort label per haplotype, compared against each entry of ``ct``.
    n_cohorts
        Number of rows in the result.
    ct
        Cohort indices to evaluate; each one selects the row to fill.

    Returns
    -------
    Array of shape ``(n_cohorts, N_GARUD_H_STATS)``. Rows for cohorts not
    listed in ``ct`` are left as NaN.
    """
    # Hash the transposed array so that each haplotype (column of ``gt``)
    # is reduced to a single value.
    hashed = hash_array(gt.transpose())
    haplotypes = hashed.transpose().flatten()

    stats = np.full((n_cohorts, N_GARUD_H_STATS), np.nan)
    for cohort in np.nditer(ct):
        in_cohort = sample_cohort == cohort
        stats[cohort, :] = _Garud_h(haplotypes[in_cohort])
    return stats
Beispiel #4
0
 def _combine(x_chunk: ArrayLike, **_: typing.Any) -> ArrayLike:
     """Intermediate step of the recursive aggregation.

     Used for intermediate recursive aggregation (see the ``split_every``
     argument to ``da.reduction``). If the reduction can be performed in
     less than 3 steps, it will not be invoked at all. Extra keyword
     arguments are accepted and ignored.
     """
     leading_shape = x_chunk.shape[:-2]
     # Fold the two trailing axes into (rows, n_map_param), then collapse
     # the rows by summing along the -2 axis.
     stacked = x_chunk.reshape(leading_shape + (-1, n_map_param))
     summed = stacked.sum(axis=-2)
     # Restore a trailing axis of length 1 on the summed result.
     return summed[..., np.newaxis]
Beispiel #5
0
def _divergence(ac: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover
    """Generalized U-function for computing divergence.

    Parameters
    ----------
    ac
        Allele counts of shape (cohorts, alleles) containing per-cohort allele counts.
    out
        Pairwise divergence stats with shape (cohorts, cohorts), where the entry at
        (i, j) is the divergence between cohort i and cohort j.
    """
    an = ac.sum(axis=-1)
    out[:, :] = np.nan  # (cohorts, cohorts)
    n_cohorts = ac.shape[0]
    n_alleles = ac.shape[1]
    # calculate the divergence for each cohort pair
    for i in range(n_cohorts):
        for j in range(i + 1, n_cohorts):
            n_pairs = an[i] * an[j]
            if n_pairs != 0.0:
                n_same = 0
                for k in range(n_alleles):
                    n_same += ac[i, k] * ac[j, k]
                n_diff = n_pairs - n_same
                div = n_diff / n_pairs
                out[i, j] = div
                out[j, i] = div

    # calculate the diversity for each cohort
    for i in range(n_cohorts):
        n_pairs = an[i] * (an[i] - 1)
        n_same = 0
        for k in range(n_alleles):
            n_same += ac[i, k] * (ac[i, k] - 1)
        n_diff = n_pairs - n_same
        if n_pairs != 0.0:
            div = n_diff / n_pairs
            out[i, i] = div
Beispiel #6
0
def _Garud_h(haplotypes: ArrayLike) -> ArrayLike:
    # find haplotype counts (sorted in descending order)
    counts = sorted(collections.Counter(haplotypes.tolist()).values(),
                    reverse=True)
    counts = np.array(counts)

    # find haplotype frequencies
    n = haplotypes.shape[0]
    f = counts / n

    # compute H1
    h1 = np.sum(f**2)

    # compute H12
    h12 = np.sum(f[:2])**2 + np.sum(f[2:]**2)

    # compute H123
    h123 = np.sum(f[:3])**2 + np.sum(f[3:]**2)

    # compute H2/H1
    h2 = h1 - f[0]**2
    h2_h1 = h2 / h1

    return np.array([h1, h12, h123, h2_h1])
Beispiel #7
0
def to_fixlen_str_array(arr: ArrayLike,
                        kind: Literal["S", "U"] = "S") -> ArrayLike:
    """Cast ``arr`` to a fixed-length string dtype wide enough for all values.

    Parameters
    ----------
    arr
        Array of strings to convert.
    kind
        Dtype kind character of the result: ``"S"`` or ``"U"``.

    Returns
    -------
    ``arr`` cast to the dtype ``<kind><length>``, where ``length`` is the
    maximum string length reported by ``max_str_len``.
    """
    # int() yields a concrete width (presumably forcing evaluation when
    # max_str_len returns a lazy result -- confirm against its definition).
    length = int(max_str_len(arr))
    fixed_dtype = f"{kind}{length}"
    return arr.astype(fixed_dtype)
Beispiel #8
0
 def max_str_len(arr: ArrayLike) -> Any:
     """Return the maximum string length over all elements of ``arr``.

     Parameters
     ----------
     arr
         Array exposing ``map_blocks`` (presumably a dask array -- confirm
         against callers) whose elements can be cast to ``str``.

     Returns
     -------
     The result of ``.max()`` over the per-element string lengths.
     """
     # Declare a 64-bit integer output: an 8-bit declaration cannot
     # represent lengths above 127. Each block is cast to the same type so
     # the declared and actual dtypes agree.
     return arr.map_blocks(
         lambda s: np.char.str_len(s.astype(str)).astype(np.int64,
                                                          copy=False),
         dtype=np.int64,
     ).max()